//===--- Lexer.cpp - Swift Language Lexer ---------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
//  This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//

#include "polarphp/llparser/Confusables.h"
#include "polarphp/llparser/Lexer.h"
#include "polarphp/ast/DiagnosticsParse.h"
#include "polarphp/ast/Identifier.h"
#include "polarphp/basic/LangOptions.h"
#include "polarphp/basic/SourceMgr.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
// FIXME: Figure out if this can be migrated to LLVM.
#include "clang/Basic/CharInfo.h"

#include <limits>

namespace polar::llparser {

// clang::isIdentifierHead and clang::isIdentifierBody are deliberately not in
// this list as a reminder that they are using C rules for identifiers.
// (Admittedly these are the same as Swift's right now.)
using clang::isAlphanumeric;
using clang::isDigit;
using clang::isHexDigit;
using clang::isHorizontalWhitespace;
using clang::isPrintable;
using clang::isWhitespace;
using namespace polar;

//===----------------------------------------------------------------------===//
// UTF8 Validation/Encoding/Decoding helper functions
//===----------------------------------------------------------------------===//

/// EncodeToUTF8 - Encode the specified code point into a UTF8 stream.  Return
/// true if it is an erroneous code point.
static bool EncodeToUTF8(unsigned CharValue,
                         SmallVectorImpl<char> &Result) {
   // Bit-width of the value with leading zeros stripped; this selects the
   // length of the UTF-8 sequence.
   unsigned SignificantBits = 32 - llvm::countLeadingZeros(CharValue);

   // Emit the lead byte and remember how many continuation bytes follow.
   // NOTE(review): values below 0x80 also take the two-byte branch here and
   // get an over-long encoding -- presumably callers emit ASCII directly
   // before calling this; confirm at the call sites.
   unsigned Continuations;
   if (SignificantBits <= 11) {
      // Encoding is 0x110aaaaa 10bbbbbb
      Result.push_back(char(0xC0 | (CharValue >> 6)));
      Continuations = 1;
   } else if (SignificantBits <= 16) {
      // Encoding is 0x1110aaaa 10bbbbbb 10cccccc
      Result.push_back(char(0xE0 | (CharValue >> 12)));
      Continuations = 2;

      // Reject the UTF-16 surrogate range and the reserved noncharacters
      // U+FDD0..U+FDEF.
      if ((CharValue >= 0xD800 && CharValue <= 0xDFFF) ||
          (CharValue >= 0xFDD0 && CharValue <= 0xFDEF))
         return true;
   } else if (SignificantBits <= 21) {
      // Encoding is 0x11110aaa 10bbbbbb 10cccccc 10dddddd
      Result.push_back(char(0xF0 | (CharValue >> 18)));
      Continuations = 3;
      // Reject over-large code points.  These cannot be encoded as UTF-16
      // surrogate pairs, so UTF-32 doesn't allow them.
      if (CharValue > 0x10FFFF)
         return true;
   } else {
      return true;  // UTF8 can encode these, but they aren't valid code points.
   }

   // Append the continuation bytes, most significant 6-bit group first.
   while (Continuations--)
      Result.push_back(char(0x80 | (0x3F & (CharValue >> (Continuations * 6)))));
   return false;
}


/// CLO8 - Return the number of leading ones in the specified 8-bit value.
static unsigned CLO8(unsigned char C) {
   // Walk the bits from the most significant end until a zero bit appears.
   unsigned Count = 0;
   while (Count < 8 && (C & (0x80u >> Count)))
      ++Count;
   return Count;
}

/// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation
/// character, which will be of the form 0b10XXXXXX
static bool isStartOfUTF8Character(unsigned char C) {
   // ASCII (and 0x80 itself) is accepted as a resynchronization point.
   if (C <= 0x80)
      return true;
   // RFC 2279: The octet values FE and FF never appear.
   // RFC 3629: The octet values C0, C1, F5 to FF never appear.
   return C >= 0xC2 && C < 0xF5;
}

/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
/// UTF8 character, validate it and advance the lexer past it.  This returns the
/// encoded character or ~0U if the encoding is invalid.
uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
                                                const char *End) {
   if (Ptr >= End)
      return ~0U;

   unsigned char Lead = *Ptr++;

   // Fast path: plain ASCII byte.
   if (Lead < 0x80)
      return Lead;

   // A byte may begin a sequence only if it is not a continuation byte and is
   // not one of the values (C0, C1, F5..FF) that RFC 3629 forbids entirely.
   auto isSequenceStart = [](unsigned char B) {
      return B <= 0x80 || (B >= 0xC2 && B < 0xF5);
   };

   // The number of leading one bits in the lead byte is the total number of
   // bytes in the sequence.
   unsigned SequenceLength = 0;
   while (SequenceLength < 8 && (Lead & (0x80u >> SequenceLength)))
      ++SequenceLength;

   // A lone continuation byte (0b10XXXXXX) or an impossible lead byte:
   // resynchronize by skipping to the next plausible sequence start.  This is
   // guaranteed to at least stop at the nul at the end of the buffer.
   if (SequenceLength == 1 || !isSequenceStart(Lead)) {
      while (Ptr < End && !isSequenceStart(*Ptr))
         ++Ptr;
      return ~0U;
   }

   // Strip the length-marker bits, keeping only the lead byte's payload.
   uint32_t CodePoint = Lead & (0xFF >> SequenceLength);

   // Fold in each continuation byte, validating its 0b10XXXXXX shape.
   for (unsigned Index = 1; Index < SequenceLength; ++Index) {
      if (Ptr >= End)
         return ~0U;
      unsigned char Trailer = *Ptr;
      if (Trailer < 0x80 || Trailer >= 0xC0)
         return ~0U;
      CodePoint = (CodePoint << 6) | (Trailer & 0x3F);
      ++Ptr;
   }

   // UTF-16 surrogate pair values are not valid code points.
   if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)
      return ~0U;

   // Reject over-long encodings: the decoded value must have required exactly
   // as many bytes as were used (2 bytes reach U+07FF, 3 bytes U+FFFF).
   if (CodePoint < 0x800)
      return SequenceLength == 2 ? CodePoint : ~0U;
   if (CodePoint < 0x10000)
      return SequenceLength == 3 ? CodePoint : ~0U;
   return SequenceLength == 4 ? CodePoint : ~0U;
}

//===----------------------------------------------------------------------===//
// Setup and Helper Methods
//===----------------------------------------------------------------------===//

/// Principal constructor: records the lexer configuration but does not touch
/// the buffer.  The PrincipalTag parameter restricts use to the delegating
/// constructors below, all of which call initialize() afterwards.
Lexer::Lexer(const PrincipalTag &, const LangOptions &LangOpts,
             const SourceManager &SourceMgr, unsigned BufferID,
             DiagnosticEngine *Diags, LexerMode LexMode,
             HashbangMode HashbangAllowed, CommentRetentionMode RetainComments,
             TriviaRetentionMode TriviaRetention)
   : LangOpts(LangOpts), SourceMgr(SourceMgr), BufferID(BufferID),
     Diags(Diags), LexMode(LexMode),
     IsHashbangAllowed(HashbangAllowed == HashbangMode::Allowed),
     RetainComments(RetainComments), TriviaRetention(TriviaRetention) {}

/// Set up the buffer pointers for lexing the byte subrange
/// [Offset, EndOffset) and eagerly lex the first token into NextToken.
void Lexer::initialize(unsigned Offset, unsigned EndOffset) {
   assert(Offset <= EndOffset);

   // Initialize buffer pointers.
   StringRef contents =
      SourceMgr.extractText(SourceMgr.getRangeForBuffer(BufferID));
   BufferStart = contents.data();
   BufferEnd = contents.data() + contents.size();
   // The buffer must be nul-terminated; the scanning loops rely on stopping
   // at this sentinel.
   assert(*BufferEnd == 0);
   assert(BufferStart + Offset <= BufferEnd);
   assert(BufferStart + EndOffset <= BufferEnd);

   // Check for Unicode BOM at start of file (Only UTF-8 BOM supported now).
   size_t BOMLength = contents.startswith("\xEF\xBB\xBF") ? 3 : 0;

   // Keep information about existence of UTF-8 BOM for transparency source code
   // editing with libSyntax.
   ContentStart = BufferStart + BOMLength;

   // Initialize code completion.
   if (BufferID == SourceMgr.getCodeCompletionBufferId()) {
      const char *Ptr = BufferStart + SourceMgr.getCodeCompletionOffset();
      if (Ptr >= BufferStart && Ptr <= BufferEnd)
         CodeCompletionPtr = Ptr;
   }

   // Tokens starting at or beyond ArtificialEOF are turned into tok::eof by
   // formToken(); this is how subrange lexing terminates.
   ArtificialEOF = BufferStart + EndOffset;
   CurPtr = BufferStart + Offset;

   // NextToken must still be in its never-lexed state before priming it.
   assert(NextToken.is(tok::NUM_TOKENS));
   lexImpl();
   assert((NextToken.isAtStartOfLine() || CurPtr != BufferStart) &&
          "The token should be at the beginning of the line, "
          "or we should be lexing from the middle of the buffer");
}

/// Lex an entire buffer, from its first byte through its end.
Lexer::Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
             unsigned BufferID, DiagnosticEngine *Diags, LexerMode LexMode,
             HashbangMode HashbangAllowed, CommentRetentionMode RetainComments,
             TriviaRetentionMode TriviaRetention)
   : Lexer(PrincipalTag(), Options, SourceMgr, BufferID, Diags, LexMode,
           HashbangAllowed, RetainComments, TriviaRetention) {
   unsigned EndOffset = SourceMgr.getRangeForBuffer(BufferID).getByteLength();
   initialize(/*Offset=*/0, EndOffset);
}

/// Lex only the byte subrange [Offset, EndOffset) of the buffer; tokens past
/// EndOffset come back as tok::eof (see formToken's ArtificialEOF handling).
Lexer::Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
             unsigned BufferID, DiagnosticEngine *Diags, LexerMode LexMode,
             HashbangMode HashbangAllowed, CommentRetentionMode RetainComments,
             TriviaRetentionMode TriviaRetention, unsigned Offset,
             unsigned EndOffset)
   : Lexer(PrincipalTag(), Options, SourceMgr, BufferID, Diags, LexMode,
           HashbangAllowed, RetainComments, TriviaRetention) {
   initialize(Offset, EndOffset);
}

/// Construct a sub-lexer that re-lexes the range between two saved states of
/// a parent lexer, inheriting all of the parent's configuration.
Lexer::Lexer(Lexer &Parent, State BeginState, State EndState)
   : Lexer(PrincipalTag(), Parent.LangOpts, Parent.SourceMgr, Parent.BufferID,
           Parent.Diags, Parent.LexMode,
           Parent.IsHashbangAllowed
           ? HashbangMode::Allowed
           : HashbangMode::Disallowed,
           Parent.RetainComments, Parent.TriviaRetention) {
   // Both states must refer to locations inside the parent's buffer.
   assert(BufferID == SourceMgr.findBufferContainingLoc(BeginState.Loc) &&
          "state for the wrong buffer");
   assert(BufferID == SourceMgr.findBufferContainingLoc(EndState.Loc) &&
          "state for the wrong buffer");

   unsigned Offset = SourceMgr.getLocOffsetInBuffer(BeginState.Loc, BufferID);
   unsigned EndOffset = SourceMgr.getLocOffsetInBuffer(EndState.Loc, BufferID);
   initialize(Offset, EndOffset);
}

/// Emit a diagnostic at the given buffer position; without an attached
/// DiagnosticEngine, hand back an inert in-flight diagnostic instead.
InFlightDiagnostic Lexer::diagnose(const char *Loc, Diagnostic Diag) {
   if (!Diags)
      return InFlightDiagnostic();
   return Diags->diagnose(getSourceLoc(Loc), Diag);
}

/// Re-lex and return the single token beginning at \p Loc, using a throwaway
/// lexer so this lexer's own state is left untouched.
Token Lexer::getTokenAt(SourceLoc Loc) {
   assert(BufferID == static_cast<unsigned>(
      SourceMgr.findBufferContainingLoc(Loc)) &&
          "location from the wrong buffer");

   // Comments and trivia are irrelevant for a single-token probe, so the
   // temporary lexer retains neither.
   Lexer L(LangOpts, SourceMgr, BufferID, Diags, LexMode,
           HashbangMode::Allowed, CommentRetentionMode::None,
           TriviaRetentionMode::WithoutTrivia);
   L.restoreState(State(Loc));
   return L.peekNextToken();
}

/// Form NextToken of kind \p Kind from the bytes [TokStart, CurPtr), applying
/// the artificial-EOF, comment-attachment, and trailing-trivia policies.
void Lexer::formToken(tok Kind, const char *TokStart) {
   assert(CurPtr >= BufferStart &&
          CurPtr <= BufferEnd && "Current pointer out of range!");

   // When we are lexing a subrange from the middle of a file buffer, we will
   // run past the end of the range, but will stay within the file.  Check if
   // we are past the imaginary EOF, and synthesize a tok::eof in this case.
   if (Kind != tok::eof && TokStart >= ArtificialEOF) {
      Kind = tok::eof;
   }
   unsigned CommentLength = 0;
   if (RetainComments == CommentRetentionMode::AttachToNextToken) {
      // 'CommentLength' here is the length from the *first* comment to the
      // token text (or its backtick if exist).
      auto Iter = llvm::find_if(LeadingTrivia, [](const ParsedTriviaPiece &Piece) {
         return isCommentTriviaKind(Piece.getKind());
      });
      // Sum every trivia piece from that first comment onward, including any
      // non-comment trivia interleaved between comments.
      for (auto End = LeadingTrivia.end(); Iter != End; Iter++) {
         CommentLength += Iter->getLength();
      }
   }

   StringRef TokenText{TokStart, static_cast<size_t>(CurPtr - TokStart)};

   // Trailing trivia is collected here so the token and its trivia are
   // produced together; eof carries no trailing trivia.
   if (TriviaRetention == TriviaRetentionMode::WithTrivia && Kind != tok::eof) {
      assert(TrailingTrivia.empty() && "TrailingTrivia is empty here");
      lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true);
   }

   NextToken.setToken(Kind, TokenText, CommentLength);
}

/// Form an identifier token for a backtick-quoted identifier and flag it as
/// escaped.
void Lexer::formEscapedIdentifierToken(const char *TokStart) {
   assert(CurPtr - TokStart >= 3 && "escaped identifier must be longer than or equal 3 bytes");
   assert(TokStart[0] == '`' && "escaped identifier starts with backtick");
   assert(CurPtr[-1] == '`' && "escaped identifier ends with backtick");

   formToken(tok::identifier, TokStart);
   // formToken() forces the token to tok::eof at the ArtificialEOF boundary;
   // only a genuine identifier gets the escaped flag.
   if (!NextToken.is(tok::eof))
      NextToken.setEscapedIdentifier(true);
}

static void validateMultilineIndents(const Token &Str, DiagnosticEngine *Diags);

/// Form a string-literal token, recording whether it is multi-line and the
/// length of any custom '#' delimiter, and eagerly validating multi-line
/// indentation.
void Lexer::formStringLiteralToken(const char *TokStart,
                                   bool IsMultilineString,
                                   unsigned CustomDelimiterLen) {
   formToken(tok::string_literal, TokStart);
   // formToken() may have synthesized tok::eof at the artificial end of the
   // lexing range; there is no literal payload to record in that case.
   if (!NextToken.is(tok::eof)) {
      NextToken.setStringLiteral(IsMultilineString, CustomDelimiterLen);

      // Indentation rules for multi-line literals are checked now, but only
      // when there is a diagnostic engine to report violations.
      if (IsMultilineString && Diags)
         validateMultilineIndents(NextToken, Diags);
   }
}

/// Produce a restorable state positioned just before the token at \p Loc,
/// backed up over horizontal whitespace so re-lexing recomputes the token's
/// start-of-line flag correctly.
Lexer::State Lexer::getStateForBeginningOfTokenLoc(SourceLoc Loc) const {
   const char *Ptr = getBufferPtrForSourceLoc(Loc);
   // Skip whitespace backwards until we hit a newline.  This is needed to
   // correctly lex the token if it is at the beginning of the line.
   while (Ptr > ContentStart) {
      char Prev = Ptr[-1];
      if (Prev == '\n' || Prev == '\r') {
         // Include the newline so the start-of-line flag is recomputed.
         --Ptr;
         break;
      }
      if (Prev == ' ' || Prev == '\t') {
         --Ptr;
      } else if (Prev == 0 && Ptr - 1 != CodeCompletionPtr) {
         // A NUL that is not the code-completion marker is treated as
         // (diagnosed) whitespace, so keep scanning past it.
         --Ptr;
      } else {
         // Either real token text or the code-completion NUL: stop here.
         break;
      }
   }
   return State(SourceLoc(llvm::SMLoc::getFromPointer(Ptr)));
}

//===----------------------------------------------------------------------===//
// Lexer Subroutines
//===----------------------------------------------------------------------===//

/// Report a NUL byte embedded in the middle of a buffer, offering a fix-it
/// that deletes exactly that one byte.  No-op without a diagnostic engine.
static void diagnoseEmbeddedNul(DiagnosticEngine *Diags, const char *Ptr) {
   assert(Ptr && "invalid source location");
   assert(*Ptr == '\0' && "not an embedded null");

   if (Diags == nullptr)
      return;

   SourceLoc NulStart = Lexer::getSourceLoc(Ptr);
   SourceLoc NulEnd = Lexer::getSourceLoc(Ptr + 1);
   Diags->diagnose(NulStart, diag::lex_nul_character)
      .fixItRemoveChars(NulStart, NulEnd);
}

/// Advance \p CurPtr to the end of line or the end of file. Returns \c true
/// if it stopped at the end of line, \c false if it stopped at the end of file.
static bool advanceToEndOfLine(const char *&CurPtr, const char *BufferEnd,
                               const char *CodeCompletionPtr = nullptr,
                               DiagnosticEngine *Diags = nullptr) {
   while (1) {
      switch (*CurPtr++) {
         case '\n':
         case '\r':
            // Leave CurPtr pointing AT the newline so the caller decides
            // whether to consume it.
            --CurPtr;
            return true; // If we found the end of the line, return.
         default:
            // If this is a "high" UTF-8 character, validate it.
            if (Diags && (signed char) (CurPtr[-1]) < 0) {
               --CurPtr;
               const char *CharStart = CurPtr;
               if (validateUTF8CharacterAndAdvance(CurPtr, BufferEnd) == ~0U)
                  Diags->diagnose(Lexer::getSourceLoc(CharStart),
                                  diag::lex_invalid_utf8);
            }
            break;   // Otherwise, eat other characters.
         case 0:
            // A NUL is only end-of-file at the buffer's terminating sentinel;
            // the code-completion marker and stray embedded NULs are skipped.
            if (CurPtr - 1 != BufferEnd) {
               if (Diags && CurPtr - 1 != CodeCompletionPtr) {
                  // If this is a random nul character in the middle of a buffer, skip
                  // it as whitespace.
                  diagnoseEmbeddedNul(Diags, CurPtr - 1);
               }
               continue;
            }
            // Otherwise, the last line of the file does not have a newline.
            --CurPtr;
            return false;
      }
   }
}

/// Skip to the end of the current line, optionally consuming the newline and
/// marking the next token as being at the start of a line.
void Lexer::skipToEndOfLine(bool EatNewline) {
   bool stoppedAtNewline =
      advanceToEndOfLine(CurPtr, BufferEnd, CodeCompletionPtr, Diags);
   if (EatNewline && stoppedAtNewline) {
      ++CurPtr;
      NextToken.setAtStartOfLine(true);
   }
}

/// Skip a '//' line comment, optionally consuming the terminating newline.
void Lexer::skipSlashSlashComment(bool EatNewline) {
   assert(CurPtr[-1] == '/' && CurPtr[0] == '/' && "Not a // comment");
   skipToEndOfLine(EatNewline);
}

/// Skip a '#!...' hashbang line, which may only appear at the very start of
/// the content.
void Lexer::skipHashbang(bool EatNewline) {
   assert(CurPtr == ContentStart && CurPtr[0] == '#' && CurPtr[1] == '!' &&
          "Not a hashbang");
   skipToEndOfLine(EatNewline);
}

/// Skip a (possibly nested) "/* ... */" block comment whose "/*" has just
/// been consumed.  Returns \c true if the comment spanned at least one
/// newline.  When \p Diags is given, invalid UTF-8, embedded NULs, and an
/// unterminated comment are diagnosed.
static bool skipToEndOfSlashStarComment(const char *&CurPtr,
                                        const char *BufferEnd,
                                        const char *CodeCompletionPtr = nullptr,
                                        DiagnosticEngine *Diags = nullptr) {
   const char *StartPtr = CurPtr - 1;
   assert(CurPtr[-1] == '/' && CurPtr[0] == '*' && "Not a /* comment");
   // Make sure to advance over the * so that we don't incorrectly handle /*/ as
   // the beginning and end of the comment.
   ++CurPtr;

   // /**/ comments can be nested, keep track of how deep we've gone.
   unsigned Depth = 1;
   bool isMultiline = false;

   while (1) {
      switch (*CurPtr++) {
         case '*':
            // Check for a '*/'
            if (*CurPtr == '/') {
               ++CurPtr;
               if (--Depth == 0)
                  return isMultiline;
            }
            break;
         case '/':
            // Check for a '/*'
            if (*CurPtr == '*') {
               ++CurPtr;
               ++Depth;
            }
            break;

         case '\n':
         case '\r':
            isMultiline = true;
            break;

         default:
            // If this is a "high" UTF-8 character, validate it.
            if (Diags && (signed char) (CurPtr[-1]) < 0) {
               --CurPtr;
               const char *CharStart = CurPtr;
               if (validateUTF8CharacterAndAdvance(CurPtr, BufferEnd) == ~0U)
                  Diags->diagnose(Lexer::getSourceLoc(CharStart),
                                  diag::lex_invalid_utf8);
            }

            break;   // Otherwise, eat other characters.
         case 0:
            // A NUL is only end-of-file at the buffer's terminating sentinel;
            // the code-completion marker and stray embedded NULs are skipped.
            if (CurPtr - 1 != BufferEnd) {
               if (Diags && CurPtr - 1 != CodeCompletionPtr) {
                  // If this is a random nul character in the middle of a buffer, skip
                  // it as whitespace.
                  diagnoseEmbeddedNul(Diags, CurPtr - 1);
               }
               continue;
            }
            // Otherwise, we have an unterminated /* comment.
            --CurPtr;

            if (Diags) {
               // Count how many levels deep we are.
               llvm::SmallString<8> Terminator("*/");
               while (--Depth != 0)
                  Terminator += "*/";
               // Place the fix-it before a trailing newline, if there is one.
               const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
               Diags
                  ->diagnose(Lexer::getSourceLoc(EOL),
                             diag::lex_unterminated_block_comment)
                  .fixItInsert(Lexer::getSourceLoc(EOL), Terminator);
               Diags->diagnose(Lexer::getSourceLoc(StartPtr), diag::lex_comment_start);
            }
            return isMultiline;
      }
   }
}

/// skipSlashStarComment - /**/ comments are skipped (treated as whitespace).
/// Note that (unlike in C) block comments can be nested.
void Lexer::skipSlashStarComment() {
   // A block comment that spans a newline makes the following token the
   // first one on its line.
   if (skipToEndOfSlashStarComment(CurPtr, BufferEnd, CodeCompletionPtr, Diags))
      NextToken.setAtStartOfLine(true);
}

/// Returns true if code point \p c may appear after the first character of an
/// identifier.  ASCII follows clang's identifier-body rules (with '$'
/// allowed); higher code points follow the ranges below.
static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
   if (c < 0x80)
      return clang::isIdentifierBody(c, /*dollar*/true);

   // N1518: Recommendations for extended identifier characters for C and C++
   // Proposed Annex X.1: Ranges of characters allowed
   return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF
          || (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA)
          || (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6)
          || (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)

          || (c >= 0x0100 && c <= 0x167F)
          || (c >= 0x1681 && c <= 0x180D)
          || (c >= 0x180F && c <= 0x1FFF)

          || (c >= 0x200B && c <= 0x200D)
          || (c >= 0x202A && c <= 0x202E)
          || (c >= 0x203F && c <= 0x2040)
          || c == 0x2054
          || (c >= 0x2060 && c <= 0x206F)

          || (c >= 0x2070 && c <= 0x218F)
          || (c >= 0x2460 && c <= 0x24FF)
          || (c >= 0x2776 && c <= 0x2793)
          || (c >= 0x2C00 && c <= 0x2DFF)
          || (c >= 0x2E80 && c <= 0x2FFF)

          || (c >= 0x3004 && c <= 0x3007)
          || (c >= 0x3021 && c <= 0x302F)
          || (c >= 0x3031 && c <= 0x303F)

          || (c >= 0x3040 && c <= 0xD7FF)

          || (c >= 0xF900 && c <= 0xFD3D)
          || (c >= 0xFD40 && c <= 0xFDCF)
          || (c >= 0xFDF0 && c <= 0xFE44)
          || (c >= 0xFE47 && c <= 0xFFF8)

          || (c >= 0x10000 && c <= 0x1FFFD)
          || (c >= 0x20000 && c <= 0x2FFFD)
          || (c >= 0x30000 && c <= 0x3FFFD)
          || (c >= 0x40000 && c <= 0x4FFFD)
          || (c >= 0x50000 && c <= 0x5FFFD)
          || (c >= 0x60000 && c <= 0x6FFFD)
          || (c >= 0x70000 && c <= 0x7FFFD)
          || (c >= 0x80000 && c <= 0x8FFFD)
          || (c >= 0x90000 && c <= 0x9FFFD)
          || (c >= 0xA0000 && c <= 0xAFFFD)
          || (c >= 0xB0000 && c <= 0xBFFFD)
          || (c >= 0xC0000 && c <= 0xCFFFD)
          || (c >= 0xD0000 && c <= 0xDFFFD)
          || (c >= 0xE0000 && c <= 0xEFFFD);
}

/// Returns true if code point \p c may begin an identifier.
static bool isValidIdentifierStartCodePoint(uint32_t c) {
   // A start code point must at least be a valid continuation code point...
   if (!isValidIdentifierContinuationCodePoint(c))
      return false;
   // ...but ASCII digits and '$' may only appear after the first character.
   if (c < 0x80 && (isDigit(c) || c == '$'))
      return false;

   // N1518: Recommendations for extended identifier characters for C and C++
   // Proposed Annex X.2: Ranges of characters disallowed initially
   bool disallowedInitially = (c >= 0x0300 && c <= 0x036F) ||
                              (c >= 0x1DC0 && c <= 0x1DFF) ||
                              (c >= 0x20D0 && c <= 0x20FF) ||
                              (c >= 0xFE20 && c <= 0xFE2F);
   return !disallowedInitially;
}

static bool advanceIf(char const *&ptr, char const *end,
                      bool (*predicate)(uint32_t)) {
   char const *next = ptr;
   uint32_t c = validateUTF8CharacterAndAdvance(next, end);
   if (c == ~0U)
      return false;
   if (predicate(c)) {
      ptr = next;
      return true;
   }
   return false;

}

/// Advance \p ptr past one UTF-8 scalar if it can begin an identifier.
static bool advanceIfValidStartOfIdentifier(char const *&ptr,
                                            char const *end) {
   return advanceIf(ptr, end, isValidIdentifierStartCodePoint);
}

/// Advance \p ptr past one UTF-8 scalar if it can continue an identifier.
static bool advanceIfValidContinuationOfIdentifier(char const *&ptr,
                                                   char const *end) {
   return advanceIf(ptr, end, isValidIdentifierContinuationCodePoint);
}

/// Advance \p ptr past one UTF-8 scalar if it can begin an operator.
static bool advanceIfValidStartOfOperator(char const *&ptr,
                                          char const *end) {
   return advanceIf(ptr, end, Identifier::isOperatorStartCodePoint);
}

/// Advance \p ptr past one UTF-8 scalar if it can continue an operator.
static bool advanceIfValidContinuationOfOperator(char const *&ptr,
                                                 char const *end) {
   return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
}

/// Determines if the given string is a valid non-operator identifier,
/// without escaping characters.
bool Lexer::isIdentifier(StringRef string) {
   if (string.empty()) return false;
   char const *cursor = string.data();
   char const *stop = string.end();
   // The first scalar must be a valid identifier head...
   if (!advanceIfValidStartOfIdentifier(cursor, stop))
      return false;
   // ...and every remaining scalar a valid identifier continuation.
   while (cursor < stop && advanceIfValidContinuationOfIdentifier(cursor, stop))
      ;
   return cursor == stop;
}

/// Determines if the given string is a valid operator identifier,
/// without escaping characters.
bool Lexer::isOperator(StringRef string) {
   if (string.empty()) return false;
   char const *cursor = string.data();
   char const *stop = string.end();
   // The first scalar must be a valid operator head...
   if (!advanceIfValidStartOfOperator(cursor, stop))
      return false;
   // ...and every remaining scalar a valid operator continuation.
   while (cursor < stop && advanceIfValidContinuationOfOperator(cursor, stop))
      ;
   return cursor == stop;
}


/// Map an identifier spelling to its keyword token kind, or tok::identifier
/// when it matches no keyword.  PIL keywords are recognized only when
/// \p InPILMode is set.
tok Lexer::kindOfIdentifier(StringRef Str, bool InPILMode) {
// X-macro expansion: one string comparison per language keyword.
#define PIL_KEYWORD(kw)
#define KEYWORD(kw) if (Str == #kw) return tok::kw_##kw;

#include "polarphp/llparser/TokenKindsDef.h"

   // PIL keywords are only active in PIL mode.
   if (InPILMode) {
#define PIL_KEYWORD(kw) if (Str == #kw) return tok::kw_##kw;

#include "polarphp/llparser/TokenKindsDef.h"
   }
   return tok::identifier;
}

/// lexIdentifier - Match [a-zA-Z_][a-zA-Z_$0-9]*
void Lexer::lexIdentifier() {
   const char *TokStart = CurPtr - 1;
   // Rewind so the UTF-8-aware helper re-reads the full first scalar.
   CurPtr = TokStart;
   bool consumedStart = advanceIfValidStartOfIdentifier(CurPtr, BufferEnd);
   assert(consumedStart && "Unexpected start");
   (void) consumedStart;

   // Lex [a-zA-Z_$0-9[[:XID_Continue:]]]*
   while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
      ;

   // Keywords are distinguished from plain identifiers by their spelling.
   tok Kind = kindOfIdentifier(StringRef(TokStart, CurPtr - TokStart),
                               LexMode == LexerMode::PIL);
   return formToken(Kind, TokStart);
}

/// lexHash - Handle #], #! for shebangs, and the family of #identifiers.
void Lexer::lexHash() {
   const char *TokStart = CurPtr - 1;

   // Scan for [a-zA-Z]+ to see what we match.
   const char *tmpPtr = CurPtr;
   if (clang::isIdentifierHead(*tmpPtr)) {
      do {
         ++tmpPtr;
      } while (clang::isIdentifierBody(*tmpPtr));
   }

   // Map the character sequence onto a pound keyword token kind, if any.
   tok Kind = llvm::StringSwitch<tok>(StringRef(CurPtr, tmpPtr - CurPtr))
#define POUND_KEYWORD(id) \
  .Case(#id, tok::pound_##id)

#include "polarphp/llparser/TokenKindsDef.h"

      .Default(tok::pound);

   // If we didn't find a match, then just return tok::pound.  This is highly
   // dubious in terms of error recovery, but is useful for code completion and
   // PIL parsing.
   if (Kind == tok::pound)
      return formToken(tok::pound, TokStart);

   // If we found something specific, return it.
   CurPtr = tmpPtr;
   return formToken(Kind, TokStart);
}


/// Is the operator beginning at the given character "left-bound"?
///
/// An operator is left-bound when the byte immediately before it glues it to
/// a preceding expression (i.e. that byte is not whitespace or an opening
/// delimiter).
static bool isLeftBound(const char *tokBegin, const char *bufferBegin) {
   // There is nothing before the first character in the file.
   if (tokBegin == bufferBegin)
      return false;

   char prev = tokBegin[-1];
   switch (prev) {
      // Whitespace.
      case ' ':
      case '\r':
      case '\n':
      case '\t':
      // Opening delimiters.
      case '(':
      case '[':
      case '{':
      // Expression separators.
      case ',':
      case ';':
      case ':':
      // Whitespace / last char in file.
      case '\0':
         return false;

      case '/':
         // A preceding "*/" ends a slash-star comment, which counts as
         // whitespace.
         return tokBegin - 1 == bufferBegin || tokBegin[-2] != '*';

      case '\xA0':
         // "\xC2\xA0" is a non-breaking space (U+00A0), i.e. whitespace.
         return tokBegin - 1 == bufferBegin || tokBegin[-2] != '\xC2';

      default:
         return true;
   }
}

/// Is the operator ending at the given character (actually one past the end)
/// "right-bound"?
///
/// The code-completion point is considered right-bound.
static bool isRightBound(const char *tokEnd, bool isLeftBound,
                         const char *codeCompletionPtr) {
   switch (*tokEnd) {
      // Whitespace.
      case ' ':
      case '\r':
      case '\n':
      case '\t':
      // Closing delimiters.
      case ')':
      case ']':
      case '}':
      // Expression separators.
      case ',':
      case ';':
      case ':':
         return false;

      case '\0':
         // The code-completion marker is right-bound; any other NUL is
         // whitespace or the last character in the file.
         return tokEnd == codeCompletionPtr;

      case '.':
         // Prefer the '^' in "x^.y" to be a postfix op, not binary, but the '^' in
         // "^.y" to be a prefix op, not binary.
         return !isLeftBound;

      case '/':
         // A following comment counts as whitespace, so this token is not right bound.
         return tokEnd[1] != '/' && tokEnd[1] != '*';

      case '\xC2':
         // "\xC2\xA0" is a non-breaking space (U+00A0), i.e. whitespace.
         return tokEnd[1] != '\xA0';

      default:
         return true;
   }
}

/// Returns true if [CurPtr, End) contains a "#>" editor-placeholder
/// terminator before the first newline (placeholders are single-line).
static bool rangeContainsPlaceholderEnd(const char *CurPtr,
                                        const char *End) {
   // Stop one byte early so the two-byte window below stays in range.
   for (const char *Cursor = CurPtr; Cursor != End - 1; ++Cursor) {
      if (*Cursor == '\n')
         return false;
      if (Cursor[0] == '#' && Cursor[1] == '>')
         return true;
   }
   return false;
}

/// lexOperatorIdentifier - Match identifiers formed out of punctuation.
void Lexer::lexOperatorIdentifier() {
   const char *TokStart = CurPtr - 1;
   // Rewind so the UTF-8-aware helper re-reads the full first scalar.
   CurPtr = TokStart;
   bool didStart = advanceIfValidStartOfOperator(CurPtr, BufferEnd);
   assert(didStart && "unexpected operator start");
   (void) didStart;

   do {
      if (CurPtr != BufferEnd && InPILBody &&
          (*CurPtr == '!' || *CurPtr == '?'))
         // When parsing PIL body, '!' and '?' are special token and can't be
         // in the middle of an operator.
         break;

      // '.' cannot appear in the middle of an operator unless the operator
      // started with a '.'.
      if (*CurPtr == '.' && *TokStart != '.')
         break;
      // Stop before an editor placeholder ("<#...#>") so it lexes separately.
      if (Identifier::isEditorPlaceholder(StringRef(CurPtr, BufferEnd - CurPtr)) &&
          rangeContainsPlaceholderEnd(CurPtr + 2, BufferEnd)) {
         break;
      }
   } while (advanceIfValidContinuationOfOperator(CurPtr, BufferEnd));

   if (CurPtr - TokStart > 2) {
      // If there is a "//" or "/*" in the middle of an identifier token,
      // it starts a comment.
      for (auto Ptr = TokStart + 1; Ptr != CurPtr - 1; ++Ptr) {
         if (Ptr[0] == '/' && (Ptr[1] == '/' || Ptr[1] == '*')) {
            CurPtr = Ptr;
            break;
         }
      }
   }

   // Decide between the binary, prefix, and postfix cases.
   // It's binary if either both sides are bound or both sides are not bound.
   // Otherwise, it's postfix if left-bound and prefix if right-bound.
   bool leftBound = isLeftBound(TokStart, ContentStart);
   bool rightBound = isRightBound(CurPtr, leftBound, CodeCompletionPtr);

   // Match various reserved words.
   if (CurPtr - TokStart == 1) {
      switch (TokStart[0]) {
         case '=':
            // A lone '=' bound on only one side is suspicious; suggest
            // balancing the whitespace around it.
            if (leftBound != rightBound) {
               auto d = diagnose(TokStart, diag::lex_unary_equal);
               if (leftBound)
                  d.fixItInsert(getSourceLoc(TokStart), " ");
               else
                  d.fixItInsert(getSourceLoc(TokStart + 1), " ");
            }
            // always emit 'tok::equal' to avoid trickle down parse errors
            return formToken(tok::equal, TokStart);
         case '&':
            if (leftBound == rightBound || leftBound)
               break;
            return formToken(tok::amp_prefix, TokStart);
         case '.': {
            if (leftBound == rightBound)
               return formToken(tok::period, TokStart);
            if (rightBound)
               return formToken(tok::period_prefix, TokStart);

            // If left bound but not right bound, handle some likely situations.

            // If there is just some horizontal whitespace before the next token, its
            // addition is probably incorrect.
            const char *AfterHorzWhitespace = CurPtr;
            while (*AfterHorzWhitespace == ' ' || *AfterHorzWhitespace == '\t')
               ++AfterHorzWhitespace;

            // First, when we are code completing "x. <ESC>", then make sure to return
            // a tok::period, since that is what the user is wanting to know about.
            if (*AfterHorzWhitespace == '\0' &&
                AfterHorzWhitespace == CodeCompletionPtr) {
               diagnose(TokStart, diag::expected_member_name);
               return formToken(tok::period, TokStart);
            }

            if (isRightBound(AfterHorzWhitespace, leftBound, CodeCompletionPtr) &&
                // Don't consider comments to be this.  A leading slash is probably
                // either // or /* and most likely occurs just in our testsuite for
                // expected-error lines.
                *AfterHorzWhitespace != '/') {
               diagnose(TokStart, diag::extra_whitespace_period)
                  .fixItRemoveChars(getSourceLoc(CurPtr),
                                    getSourceLoc(AfterHorzWhitespace));
               return formToken(tok::period, TokStart);
            }

            // Otherwise, it is probably a missing member.
            diagnose(TokStart, diag::expected_member_name);
            return formToken(tok::unknown, TokStart);
         }
         case '?':
            if (leftBound)
               return formToken(tok::question_postfix, TokStart);
            return formToken(tok::question_infix, TokStart);
      }
   } else if (CurPtr - TokStart == 2) {
      // Pack the two bytes into one value so a switch can match them.
      switch ((TokStart[0] << 8) | TokStart[1]) {
         case ('-' << 8) | '>': // ->
            return formToken(tok::arrow, TokStart);
         case ('*' << 8) | '/': // */
            diagnose(TokStart, diag::lex_unexpected_block_comment_end);
            return formToken(tok::unknown, TokStart);
      }
   } else {
      // Verify there is no "*/" in the middle of the identifier token, we reject
      // it as potentially ending a block comment.
      auto Pos = StringRef(TokStart, CurPtr - TokStart).find("*/");
      if (Pos != StringRef::npos) {
         diagnose(TokStart + Pos, diag::lex_unexpected_block_comment_end);
         return formToken(tok::unknown, TokStart);
      }
   }

   if (leftBound == rightBound)
      return formToken(leftBound ? tok::oper_binary_unspaced :
                       tok::oper_binary_spaced, TokStart);

   return formToken(leftBound ? tok::oper_postfix : tok::oper_prefix, TokStart);
}

/// lexDollarIdent - Match $[0-9a-zA-Z_$]+
void Lexer::lexDollarIdent() {
   const char *tokStart = CurPtr - 1;
   assert(*tokStart == '$');

   // In a PIL function body, '$' is a token by itself, except it's a PIL global
   // name. PIL global identifiers may start with a '$', e.g. @$S1m3fooyyF.
   if (InPILBody && NextToken.getKind() != tok::at_sign)
      return formToken(tok::pil_dollar, tokStart);

   bool isAllDigits = true;
   for (;; ++CurPtr) {
      if (isDigit(*CurPtr)) {
         // continue
      } else if (clang::isIdentifierHead(*CurPtr, /*dollar*/true)) {
         isAllDigits = false;
         // continue
      } else {
         break;
      }
   }

   if (CurPtr == tokStart + 1) {
      // It is an error to see a standalone '$'. Offer to replace '$' with '`$`'.
      diagnose(tokStart, diag::standalone_dollar_identifier)
         .fixItReplaceChars(getSourceLoc(tokStart), getSourceLoc(CurPtr), "`$`");
      return formToken(tok::identifier, tokStart);
   }

   if (!isAllDigits) {
      return formToken(tok::identifier, tokStart);
   } else {
      return formToken(tok::dollarident, tokStart);
   }
}

/// Radix of the integer literal whose digits are being validated; the value is
/// passed (as unsigned) to diag::lex_invalid_digit_in_int_literal to select
/// the diagnostic wording.
enum class ExpectedDigitKind : unsigned {
   Binary, Octal, Decimal, Hex
};

/// lexHexNumber:
///   integer_literal  ::= 0x[0-9a-fA-F][0-9a-fA-F_]*
///   floating_literal ::= 0x[0-9A-Fa-f][0-9A-Fa-f_]*
///                          (\.[0-9A-Fa-f][0-9A-Fa-f_]*)?[pP][+-]?[0-9][0-9_]*
void Lexer::lexHexNumber() {
   // We assume we're starting from the 'x' in a '0x...' floating-point literal.
   assert(*CurPtr == 'x' && "not a hex literal");
   const char *TokStart = CurPtr - 1;
   assert(*TokStart == '0' && "not a hex literal");

   // Swallow any identifier-like garbage and form one tok::unknown so the
   // parser sees a single bad token rather than several misleading ones.
   auto expected_digit = [&]() {
      while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
      return formToken(tok::unknown, TokStart);
   };

   // Diagnose an invalid character where a hex digit was required, then
   // recover through expected_digit().
   auto expected_hex_digit = [&](const char *loc) {
      diagnose(loc, diag::lex_invalid_digit_in_int_literal, StringRef(loc, 1),
               (unsigned) ExpectedDigitKind::Hex);
      return expected_digit();
   };

   // 0x[0-9a-fA-F][0-9a-fA-F_]*
   ++CurPtr;
   if (!isHexDigit(*CurPtr))
      return expected_hex_digit(CurPtr);

   while (isHexDigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;

   // No '.', 'p' or 'P' following: this is an integer literal, provided no
   // identifier characters are glued onto its end.
   if (*CurPtr != '.' && *CurPtr != 'p' && *CurPtr != 'P') {
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
         return expected_hex_digit(tmp);
      else
         return formToken(tok::integer_literal, TokStart);
   }

   const char *PtrOnDot = nullptr;

   // (\.[0-9A-Fa-f][0-9A-Fa-f_]*)?
   if (*CurPtr == '.') {
      PtrOnDot = CurPtr;
      ++CurPtr;

      // If the character after the '.' is not a digit, assume we have an int
      // literal followed by a dot expression.
      if (!isHexDigit(*CurPtr)) {
         --CurPtr;
         return formToken(tok::integer_literal, TokStart);
      }

      while (isHexDigit(*CurPtr) || *CurPtr == '_')
         ++CurPtr;

      if (*CurPtr != 'p' && *CurPtr != 'P') {
         if (!isDigit(PtrOnDot[1])) {
            // e.g: 0xff.description
            CurPtr = PtrOnDot;
            return formToken(tok::integer_literal, TokStart);
         }
         // A hex fraction without a binary exponent is not a valid literal.
         diagnose(CurPtr, diag::lex_expected_binary_exponent_in_hex_float_literal);
         return formToken(tok::unknown, TokStart);
      }
   }

   // [pP][+-]?[0-9][0-9_]*
   // NOTE: parenthesized so the message applies to the whole condition; the
   // previous `a || b && "msg"` parsed as `a || (b && "msg")` due to
   // precedence (and drew -Wlogical-op-parentheses).
   assert((*CurPtr == 'p' || *CurPtr == 'P') && "not at a hex float exponent?!");
   ++CurPtr;

   bool signedExponent = false;
   if (*CurPtr == '+' || *CurPtr == '-') {
      ++CurPtr;  // Eat the sign.
      signedExponent = true;
   }

   if (!isDigit(*CurPtr)) {
      if (PtrOnDot && !isDigit(PtrOnDot[1]) && !signedExponent) {
         // e.g: 0xff.fpValue, 0xff.fp
         CurPtr = PtrOnDot;
         return formToken(tok::integer_literal, TokStart);
      }
      // Note: 0xff.fp+otherExpr can be valid expression. But we don't accept it.

      // There are 3 cases to diagnose if the exponent starts with a non-digit:
      // identifier (invalid character), underscore (invalid first character),
      // non-identifier (empty exponent)
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
         diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1),
                  *tmp == '_');
      else
         diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent);

      return expected_digit();
   }

   while (isDigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;

   // Reject identifier characters glued onto the end of the exponent.
   auto tmp = CurPtr;
   if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) {
      diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1),
               false);
      return expected_digit();
   }

   return formToken(tok::floating_literal, TokStart);
}

/// lexNumber:
///   integer_literal  ::= [0-9][0-9_]*
///   integer_literal  ::= 0x[0-9a-fA-F][0-9a-fA-F_]*
///   integer_literal  ::= 0o[0-7][0-7_]*
///   integer_literal  ::= 0b[01][01_]*
///   floating_literal ::= [0-9][0-9_]*\.[0-9][0-9_]*
///   floating_literal ::= [0-9][0-9_]*\.[0-9][0-9_]*[eE][+-]?[0-9][0-9_]*
///   floating_literal ::= [0-9][0-9_]*[eE][+-]?[0-9][0-9_]*
///   floating_literal ::= 0x[0-9A-Fa-f][0-9A-Fa-f_]*
///                          (\.[0-9A-Fa-f][0-9A-Fa-f_]*)?[pP][+-]?[0-9][0-9_]*
void Lexer::lexNumber() {
   const char *TokStart = CurPtr - 1;
   assert((isDigit(*TokStart) || *TokStart == '.') && "Unexpected start");

   // Swallow any identifier-like garbage and form one tok::unknown so the
   // parser sees a single bad token rather than several misleading ones.
   auto expected_digit = [&]() {
      while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
      return formToken(tok::unknown, TokStart);
   };

   // Diagnose an invalid digit for the given radix, then recover through
   // expected_digit().
   auto expected_int_digit = [&](const char *loc, ExpectedDigitKind kind) {
      diagnose(loc, diag::lex_invalid_digit_in_int_literal, StringRef(loc, 1),
               (unsigned) kind);
      return expected_digit();
   };

   // Hex literals (including hex floats) have their own routine.
   if (*TokStart == '0' && *CurPtr == 'x')
      return lexHexNumber();

   if (*TokStart == '0' && *CurPtr == 'o') {
      // 0o[0-7][0-7_]*
      ++CurPtr;
      if (*CurPtr < '0' || *CurPtr > '7')
         return expected_int_digit(CurPtr, ExpectedDigitKind::Octal);

      while ((*CurPtr >= '0' && *CurPtr <= '7') || *CurPtr == '_')
         ++CurPtr;

      // Reject identifier characters glued onto the end of the literal.
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
         return expected_int_digit(tmp, ExpectedDigitKind::Octal);

      return formToken(tok::integer_literal, TokStart);
   }

   if (*TokStart == '0' && *CurPtr == 'b') {
      // 0b[01][01_]*
      ++CurPtr;
      if (*CurPtr != '0' && *CurPtr != '1')
         return expected_int_digit(CurPtr, ExpectedDigitKind::Binary);

      while (*CurPtr == '0' || *CurPtr == '1' || *CurPtr == '_')
         ++CurPtr;

      // Reject identifier characters glued onto the end of the literal.
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
         return expected_int_digit(tmp, ExpectedDigitKind::Binary);

      return formToken(tok::integer_literal, TokStart);
   }

   // Handle a leading [0-9]+, lexing an integer or falling through if we have a
   // floating point value.
   while (isDigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;

   // Lex things like 4.x as '4' followed by a tok::period.
   if (*CurPtr == '.') {
      // NextToken is the soon to be previous token
      // Therefore: x.0.1 is sub-tuple access, not x.float_literal
      if (!isDigit(CurPtr[1]) || NextToken.is(tok::period))
         return formToken(tok::integer_literal, TokStart);
   } else {
      // Floating literals must have '.', 'e', or 'E' after digits.  If it is
      // something else, then this is the end of the token.
      if (*CurPtr != 'e' && *CurPtr != 'E') {
         auto tmp = CurPtr;
         if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
            return expected_int_digit(tmp, ExpectedDigitKind::Decimal);

         return formToken(tok::integer_literal, TokStart);
      }
   }

   // Lex decimal point.
   if (*CurPtr == '.') {
      ++CurPtr;

      // Lex any digits after the decimal point.
      while (isDigit(*CurPtr) || *CurPtr == '_')
         ++CurPtr;
   }

   // Lex exponent.
   if (*CurPtr == 'e' || *CurPtr == 'E') {
      ++CurPtr;  // Eat the 'e'
      if (*CurPtr == '+' || *CurPtr == '-')
         ++CurPtr;  // Eat the sign.

      if (!isDigit(*CurPtr)) {
         // There are 3 cases to diagnose if the exponent starts with a non-digit:
         // identifier (invalid character), underscore (invalid first character),
         // non-identifier (empty exponent)
         auto tmp = CurPtr;
         if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
            diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1),
                     *tmp == '_');
         else
            diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent);

         return expected_digit();
      }

      while (isDigit(*CurPtr) || *CurPtr == '_')
         ++CurPtr;

      // Reject identifier characters glued onto the end of the exponent.
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) {
         diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1),
                  false);
         return expected_digit();
      }
   }

   return formToken(tok::floating_literal, TokStart);
}

///   unicode_character_escape ::= [\]u{hex+}
///   hex                      ::= [0-9a-fA-F]
///
/// On entry CurPtr points at the '{'. Returns the decoded code point, or ~1U
/// on a malformed escape (missing '}' or a digit count outside 1-8), emitting
/// a diagnostic when Diags is non-null.
unsigned Lexer::lexUnicodeEscape(const char *&CurPtr, Lexer *Diags) {
   assert(CurPtr[0] == '{' && "Invalid unicode escape");
   ++CurPtr;  // Eat the '{'.

   // Collect the run of hex digits.
   const char *DigitStart = CurPtr;
   while (isHexDigit(*CurPtr))
      ++CurPtr;
   unsigned NumDigits = (unsigned) (CurPtr - DigitStart);

   // The escape must be terminated by a closing brace.
   if (*CurPtr != '}') {
      if (Diags)
         Diags->diagnose(CurPtr, diag::lex_invalid_u_escape_rbrace);
      return ~1U;
   }
   ++CurPtr;  // Eat the '}'.

   // Between one and eight digits are permitted.
   if (NumDigits < 1 || NumDigits > 8) {
      if (Diags)
         Diags->diagnose(CurPtr, diag::lex_invalid_u_escape);
      return ~1U;
   }

   unsigned CharValue = 0;
   StringRef(DigitStart, NumDigits).getAsInteger(16, CharValue);
   return CharValue;
}

/// maybeConsumeNewlineEscape - Check whether only horizontal whitespace lies
/// between CurPtr+Offset and the next line ending. If so, advance CurPtr past
/// the line ending ("\n", "\r", or "\r\n") and return true; otherwise leave
/// CurPtr untouched and return false.
static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
   const char *Scan = CurPtr + Offset;
   for (;;) {
      char c = *Scan++;
      if (c == ' ' || c == '\t')
         continue;  // Horizontal whitespace may precede the newline.
      if (c == '\r') {
         // Treat "\r\n" as a single line ending.
         if (*Scan == '\n')
            ++Scan;
         CurPtr = Scan;
         return true;
      }
      if (c == '\n') {
         CurPtr = Scan;
         return true;
      }
      // Anything else (including NUL) means there was no newline to elide.
      return false;
   }
}

/// diagnoseZeroWidthMatchAndAdvance - Match a single delimiter character and
/// advance past it. This is the central place to (eventually) diagnose
/// invisible characters inside delimiters: an invisible character in the
/// middle of a delimiter can be used to extend the literal beyond what it
/// would appear, creating potential security bugs.
static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr,
                                             DiagnosticEngine *Diags) {
   // TODO: Detect, diagnose and skip over zero-width characters if required.
   // See https://bugs.swift.org/browse/SR-8678 for possible implementation.
   if (*CurPtr != Target)
      return false;
   ++CurPtr;  // Consume the matched character.
   return true;
}

/// advanceIfMultilineDelimiter - Centralized check for multiline delimiter.
/// CurPtr[-1] is the quote that brought us here; two further '"' characters
/// complete a '"""' delimiter. CurPtr is only advanced on a full match.
static bool advanceIfMultilineDelimiter(const char *&CurPtr,
                                        DiagnosticEngine *Diags) {
   const char *Scan = CurPtr;
   if (*(Scan - 1) != '"')
      return false;
   if (!diagnoseZeroWidthMatchAndAdvance('"', Scan, Diags))
      return false;
   if (!diagnoseZeroWidthMatchAndAdvance('"', Scan, Diags))
      return false;
   CurPtr = Scan;
   return true;
}

/// advanceIfCustomDelimiter - Extracts/detects any custom delimiter on
/// opening a string literal, advances CurPtr if a delimiter is found and
/// returns a non-zero delimiter length. CurPtr[-1] must be '#' when called.
static unsigned advanceIfCustomDelimiter(const char *&CurPtr,
                                         DiagnosticEngine *Diags) {
   assert(CurPtr[-1] == '#');

   // Count the whole run of '#'s; it only forms a delimiter when immediately
   // followed by the opening quote. CurPtr is only advanced on success.
   const char *Scan = CurPtr;
   unsigned DelimiterLen = 1;
   while (diagnoseZeroWidthMatchAndAdvance('#', Scan, Diags))
      ++DelimiterLen;

   if (!diagnoseZeroWidthMatchAndAdvance('"', Scan, Diags))
      return 0;  // Not a raw string opener.

   CurPtr = Scan;
   return DelimiterLen;
}

/// delimiterMatches - Does custom delimiter ('#' characters surrounding quotes)
/// match the number of '#' characters after '\' inside the string? This allows
/// interpolation inside a "raw" string. Normal/cooked string processing is
/// the degenerate case of there being no '#' characters surrounding the quotes.
/// If delimiter matches, advances byte pointer passed in and returns true.
/// Also used to detect the final delimiter of a string when IsClosing == true.
static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr,
                             DiagnosticEngine *Diags, bool IsClosing = false) {
   // No custom delimiter: plain/cooked strings trivially match.
   if (!CustomDelimiterLen)
      return true;
   // Count the '#'s actually present at BytesPtr.
   const char *TmpPtr = BytesPtr;
   while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) {}

   // Too few '#'s: no match; BytesPtr is left untouched.
   if (TmpPtr - BytesPtr < CustomDelimiterLen)
      return false;

   // Consume exactly the expected number of '#'s...
   BytesPtr += CustomDelimiterLen;

   // ...and diagnose any excess ones with a removal fix-it.
   if (Diags && TmpPtr > BytesPtr) {
      Diag<> message = IsClosing ? diag::lex_invalid_closing_delimiter
                                 : diag::lex_invalid_escape_delimiter;
      Diags->diagnose(Lexer::getSourceLoc(BytesPtr), message)
         .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr),
                           Lexer::getSourceLoc(TmpPtr));
   }
   return true;
}

/// lexCharacter - Read a character and return its UTF32 code.  If this is the
/// end of enclosing string/character sequence (i.e. the character is equal to
/// 'StopQuote'), this returns ~0U and advances 'CurPtr' pointing to the end of
/// terminal quote.  If this is a malformed character sequence, it emits a
/// diagnostic (when EmitDiagnostics is true) and returns ~1U.
///
///   character_escape  ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0
///   character_escape  ::= unicode_character_escape
unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
                             bool EmitDiagnostics, bool IsMultilineString,
                             unsigned CustomDelimiterLen) {
   const char *CharStart = CurPtr;

   switch (*CurPtr++) {
      default: {// Normal characters are part of the string.
         // If this is a "high" UTF-8 character, validate it.
         if ((signed char) (CurPtr[-1]) >= 0) {
            // Plain ASCII byte. Unprintable characters are diagnosed (tab is
            // tolerated inside multiline literals) but still returned.
            if (isPrintable(CurPtr[-1]) == 0)
               if (!(IsMultilineString && (CurPtr[-1] == '\t')))
                  if (EmitDiagnostics)
                     diagnose(CharStart, diag::lex_unprintable_ascii_character);
            return CurPtr[-1];
         }
         // Multi-byte sequence: back up and decode/validate the whole scalar.
         --CurPtr;
         unsigned CharValue = validateUTF8CharacterAndAdvance(CurPtr, BufferEnd);
         if (CharValue != ~0U) return CharValue;
         if (EmitDiagnostics)
            diagnose(CharStart, diag::lex_invalid_utf8);
         return ~1U;
      }
      case '"':
      case '\'':
         if (CurPtr[-1] == StopQuote) {
            // Multiline and custom escaping are only enabled for " quote.
            if (LLVM_UNLIKELY(StopQuote != '"'))
               return ~0U;
            if (!IsMultilineString && !CustomDelimiterLen)
               return ~0U;

            // For multiline/raw literals the quote terminates the string only
            // when followed by the full closing delimiter; otherwise it is an
            // ordinary character of the string.
            DiagnosticEngine *D = EmitDiagnostics ? Diags : nullptr;
            auto TmpPtr = CurPtr;
            if (IsMultilineString && !advanceIfMultilineDelimiter(TmpPtr, D))
               return '"';
            if (CustomDelimiterLen &&
                !delimiterMatches(CustomDelimiterLen, TmpPtr, D, /*IsClosing=*/true))
               return '"';
            CurPtr = TmpPtr;
            return ~0U;
         }
         // Otherwise, this is just a character.
         return CurPtr[-1];

      case 0:
         assert(CurPtr - 1 != BufferEnd && "Caller must handle EOF");
         if (EmitDiagnostics)
            diagnose(CurPtr - 1, diag::lex_nul_character);
         return CurPtr[-1];
      case '\n':  // String literals cannot have \n or \r in them.
      case '\r':
         assert(IsMultilineString && "Caller must handle newlines in non-multiline");
         return CurPtr[-1];
      case '\\':  // Escapes.
         // In a raw string the backslash must be followed by the matching
         // number of '#'s to act as an escape; otherwise it is literal.
         if (!delimiterMatches(CustomDelimiterLen, CurPtr,
                               EmitDiagnostics ? Diags : nullptr))
            return '\\';
         break;
   }

   unsigned CharValue = 0;
   // Escape processing.  We already ate the "\".
   switch (*CurPtr) {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         // In multiline literals, backslash + (whitespace +) newline elides
         // the newline.
         if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
            return '\n';
         LLVM_FALLTHROUGH;
      default:  // Invalid escape.
         if (EmitDiagnostics)
            diagnose(CurPtr, diag::lex_invalid_escape);
         // If this looks like a plausible escape character, recover as though this
         // is an invalid escape.
         if (isAlphanumeric(*CurPtr)) ++CurPtr;
         return ~1U;

         // Simple single-character escapes.
      case '0':
         ++CurPtr;
         return '\0';
      case 'n':
         ++CurPtr;
         return '\n';
      case 'r':
         ++CurPtr;
         return '\r';
      case 't':
         ++CurPtr;
         return '\t';
      case '"':
         ++CurPtr;
         return '"';
      case '\'':
         ++CurPtr;
         return '\'';
      case '\\':
         ++CurPtr;
         return '\\';

      case 'u': {  //  \u HEX HEX HEX HEX
         ++CurPtr;
         if (*CurPtr != '{') {
            if (EmitDiagnostics)
               diagnose(CurPtr - 1, diag::lex_unicode_escape_braces);
            return ~1U;
         }

         CharValue = lexUnicodeEscape(CurPtr, EmitDiagnostics ? this : nullptr);
         if (CharValue == ~1U) return ~1U;
         break;
      }
   }

   // Check to see if the encoding is valid.
   llvm::SmallString<64> TempString;
   if (CharValue >= 0x80 && EncodeToUTF8(CharValue, TempString)) {
      if (EmitDiagnostics)
         diagnose(CharStart, diag::lex_invalid_unicode_scalar);
      return ~1U;
   }

   return CharValue;
}

/// skipToEndOfInterpolatedExpression - Given the first character after a \(
/// sequence in a string literal (the start of an interpolated expression),
/// scan forward to the end of the interpolated expression and return the end.
/// On success, the returned pointer will point to the ')' at the end of the
/// interpolated expression.  On failure, it will point to the first character
/// that cannot be lexed as part of the interpolated expression; this character
/// will never be ')'.
///
/// This function performs brace and quote matching, keeping a stack of
/// outstanding delimiters as it scans the string.
static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
                                                     const char *EndPtr,
                                                     bool IsMultilineString) {
   // Stack of outstanding delimiters: '(' for parens, '"'/'\'' for strings.
   SmallVector<char, 4> OpenDelimiters;
   // For each enclosing string (outermost entry included), whether it may
   // span newlines.
   SmallVector<bool, 4> AllowNewline;
   // For each open string literal, its '#' custom-delimiter length.
   SmallVector<unsigned, 4> CustomDelimiter;
   AllowNewline.push_back(IsMultilineString);

   auto inStringLiteral = [&]() {
      return !OpenDelimiters.empty() &&
             (OpenDelimiters.back() == '"' || OpenDelimiters.back() == '\'');
   };
   while (true) {
      // This is a simple scanner, capable of recognizing nested parentheses and
      // string literals but not much else.  The implications of this include not
      // being able to break an expression over multiple lines in an interpolated
      // string.  This limitation allows us to recover from common errors though.
      //
      // On success scanning the expression body, the real lexer will be used to
      // relex the body when parsing the expressions.  We let it diagnose any
      // issues with malformed tokens or other problems.
      unsigned CustomDelimiterLen = 0;
      switch (*CurPtr++) {
         // String literals in general cannot be split across multiple lines;
         // interpolated ones are no exception - unless multiline literals.
         case '\n':
         case '\r':
            if (AllowNewline.back())
               continue;
            // Will be diagnosed as an unterminated string literal.
            return CurPtr - 1;
         case 0:
            if (CurPtr - 1 != EndPtr)
               continue; // CC token or random NUL character.
            // Will be diagnosed as an unterminated string literal.
            return CurPtr - 1;

         case '#':
            // Potential start of a raw string literal, e.g. #"..."#.
            if (inStringLiteral() ||
                !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, nullptr)))
               continue;
            assert(CurPtr[-1] == '"' &&
                   "advanceIfCustomDelimiter() must stop at after the quote");
            LLVM_FALLTHROUGH;

         case '"':
         case '\'': {
            if (!inStringLiteral()) {
               // Open string literal.
               OpenDelimiters.push_back(CurPtr[-1]);
               AllowNewline.push_back(advanceIfMultilineDelimiter(CurPtr, nullptr));
               CustomDelimiter.push_back(CustomDelimiterLen);
               continue;
            }

            // In string literal.

            // Skip if it's an another kind of quote in string literal. e.g. "foo's".
            if (OpenDelimiters.back() != CurPtr[-1])
               continue;

            // Multi-line string can only be closed by '"""'.
            if (AllowNewline.back() && !advanceIfMultilineDelimiter(CurPtr, nullptr))
               continue;

            // Check whether we have equivalent number of '#'s.
            if (!delimiterMatches(CustomDelimiter.back(), CurPtr, nullptr, true))
               continue;

            // Close string literal.
            OpenDelimiters.pop_back();
            AllowNewline.pop_back();
            CustomDelimiter.pop_back();
            continue;
         }
         case '\\':
            // We ignore invalid escape sequence here. They should be diagnosed in
            // the real lexer functions.
            if (inStringLiteral() &&
                delimiterMatches(CustomDelimiter.back(), CurPtr, nullptr)) {
               switch (*CurPtr++) {
                  case '(':
                     // Entering a recursive interpolated expression
                     OpenDelimiters.push_back('(');
                     continue;
                  case '\n':
                  case '\r':
                  case 0:
                     // Don't jump over newline/EOF due to preceding backslash.
                     // Let the outer switch to handle it.
                     --CurPtr;
                     continue;
                  default:
                     continue;
               }
            }
            continue;

            // Paren nesting deeper to support "foo = \((a+b)-(c*d)) bar".
         case '(':
            if (!inStringLiteral()) {
               OpenDelimiters.push_back('(');
            }
            continue;
         case ')':
            if (OpenDelimiters.empty()) {
               // No outstanding open delimiters; we're done.
               return CurPtr - 1;
            } else if (OpenDelimiters.back() == '(') {
               // Pop the matching bracket and keep going.
               OpenDelimiters.pop_back();
               continue;
            } else {
               // It's a right parenthesis in a string literal.
               assert(inStringLiteral());
               continue;
            }
         case '/':
            // Comments inside the expression body are tolerated, except kinds
            // that would hide a newline where newlines are not allowed.
            if (inStringLiteral())
               continue;

            if (*CurPtr == '*') {
               auto CommentStart = CurPtr - 1;
               bool isMultilineComment = skipToEndOfSlashStarComment(CurPtr, EndPtr);
               if (isMultilineComment && !AllowNewline.back()) {
                  // Multiline comment is prohibited in string literal.
                  // Return the start of the comment.
                  return CommentStart;
               }
            } else if (*CurPtr == '/') {
               if (!AllowNewline.back()) {
                  // '//' comment is impossible in single line string literal.
                  // Return the start of the comment.
                  return CurPtr - 1;
               }
               // Advance to the end of the comment.
               if (/*isEOL=*/advanceToEndOfLine(CurPtr, EndPtr))
                  ++CurPtr;
            }
            continue;
         default:
            // Normal token character.
            continue;
      }
   }
}

/// getStringLiteralContent:
/// Extract content of string literal from inside quotes: strips any '#'
/// custom delimiters first, then the quotes themselves (three per side for
/// multiline literals, one otherwise).
static StringRef getStringLiteralContent(const Token &Str) {
   StringRef Bytes = Str.getText();

   const unsigned CustomDelimiterLen = Str.getCustomDelimiterLen();
   if (CustomDelimiterLen)
      Bytes = Bytes.drop_front(CustomDelimiterLen).drop_back(CustomDelimiterLen);

   const unsigned QuoteLen = Str.isMultilineString() ? 3 : 1;
   return Bytes.drop_front(QuoteLen).drop_back(QuoteLen);
}

/// Return the length of the longest common prefix of the two strings.
static size_t commonPrefixLength(StringRef shorter, StringRef longer) {
   // Walk both strings in lockstep until they disagree or one runs out.
   size_t idx = 0;
   for (; idx < shorter.size() && idx < longer.size(); ++idx) {
      if (shorter[idx] != longer[idx])
         break;
   }
   return idx;
}

/// getMultilineTrailingIndent:
/// Determine trailing indent to be used for multiline literal indent stripping.
/// Returns the whitespace run between the literal's last newline and its end
/// (empty if the closing delimiter does not sit on its own whitespace-only
/// line). When Diags is non-null, also emits the related diagnostics.
StringRef
getMultilineTrailingIndent(StringRef Bytes, DiagnosticEngine *Diags = nullptr,
                           unsigned CustomDelimiterLen = 0) {
   const char *begin = Bytes.begin(), *end = Bytes.end(), *start = end;
   bool sawNonWhitespace = false;

   // Work back from the end to find whitespace to strip.
   while (!sawNonWhitespace && start > begin) {
      switch (*--start) {
         case ' ':
         case '\t':
            continue;
         case '\n':
         case '\r': {
            // Found the newline preceding the closing delimiter; the indent
            // is everything between it and the end of the content.
            ++start;

            // Disallow escaped newline in the last line.
            if (Diags && !CustomDelimiterLen) {
               auto *Ptr = start - 1;
               // Step back over the line ending ("\r\n" counts as one).
               if (*Ptr == '\n') --Ptr;
               if (*Ptr == '\r') --Ptr;
               auto *LineEnd = Ptr + 1;
               // Skip trailing horizontal whitespace on the previous line.
               while (Ptr > begin && (*Ptr == ' ' || *Ptr == '\t')) --Ptr;
               if (*Ptr == '\\') {
                  // An odd-length run of backslashes escapes the newline; an
                  // even-length run is just literal backslashes.
                  auto escapeLoc = Lexer::getSourceLoc(Ptr);
                  bool invalid = true;
                  while (*--Ptr == '\\') invalid = !invalid;
                  if (invalid)
                     Diags->diagnose(escapeLoc, diag::lex_escaped_newline_at_lastline)
                        .fixItRemoveChars(escapeLoc, Lexer::getSourceLoc(LineEnd));
               }
            }

            return StringRef(start, end - start);
         }
         default:
            sawNonWhitespace = true;
      }
   }

   // Non-whitespace found before any newline: the closing delimiter is not on
   // its own line; suggest inserting one.
   if (sawNonWhitespace && Diags) {
      auto loc = Lexer::getSourceLoc(start + 1);
      Diags->diagnose(loc, diag::lex_illegal_multiline_string_end)
            // FIXME: Should try to suggest indentation.
         .fixItInsert(loc, "\n");
   }

   return "";
}

/// diagnoseInvalidMultilineIndents:
/// Emit errors for a group of multiline indents with the same MistakeOffset.
/// Note: Does not emit an error if MistakeOffset does not lie within
/// ExpectedIndent.
static void diagnoseInvalidMultilineIndents(
   DiagnosticEngine *Diags,
   StringRef ExpectedIndent,
   SourceLoc IndentLoc,
   StringRef Bytes,
   SmallVector<size_t, 4> LineStarts,
   size_t MistakeOffset,
   StringRef ActualIndent) {
   if (MistakeOffset >= ExpectedIndent.size()) {
      // These lines were valid; there's nothing to correct.
      return;
   }

   assert(!LineStarts.empty());

   auto getLoc = [&](size_t offset) -> SourceLoc {
      return Lexer::getSourceLoc((const char *) Bytes.bytes_begin() + offset);
   };
   auto classify = [&](unsigned char ch) -> unsigned {
      switch (ch) {
         case ' ':
            return 0;
         case '\t':
            return 1;
         default:
            return 2;
      }
   };

   Diags->diagnose(getLoc(LineStarts[0] + MistakeOffset),
                   diag::lex_multiline_string_indent_inconsistent,
                   LineStarts.size() != 1, LineStarts.size(),
                   classify(Bytes[LineStarts[0] + MistakeOffset]));

   Diags->diagnose(IndentLoc.getAdvancedLoc(MistakeOffset),
                   diag::lex_multiline_string_indent_should_match_here,
                   classify(ExpectedIndent[MistakeOffset]));

   auto fix = Diags->diagnose(getLoc(LineStarts[0] + MistakeOffset),
                              diag::lex_multiline_string_indent_change_line,
                              LineStarts.size() != 1);

   assert(MistakeOffset <= ActualIndent.size());
   assert(ExpectedIndent.substr(0, MistakeOffset) ==
          ActualIndent.substr(0, MistakeOffset));

   for (auto line : LineStarts) {
      fix.fixItReplaceChars(getLoc(line + MistakeOffset),
                            getLoc(line + ActualIndent.size()),
                            ExpectedIndent.substr(MistakeOffset));
   }
}

/// validateMultilineIndents:
/// Diagnose contents of string literal that have inconsistent indentation.
static void validateMultilineIndents(const Token &Str,
                                     DiagnosticEngine *Diags) {
   StringRef Bytes = getStringLiteralContent(Str);
   StringRef Indent =
      getMultilineTrailingIndent(Bytes, Diags, Str.getCustomDelimiterLen());
   if (Indent.empty())
      return;
   SourceLoc IndentStartLoc = Lexer::getSourceLoc(Indent.data());

   // The offset into the previous line where it experienced its first
   // indentation error, or Indent.size() if every character matched.
   size_t lastMistakeOffset = std::numeric_limits<size_t>::max();
   // Offsets for each consecutive previous line with its first error at
   // lastMistakeOffset.
   SmallVector<size_t, 4> linesWithLastMistakeOffset = {};
   // Prefix of indentation that's present on all lines in
   // linesWithLastMistakeOffset.
   StringRef commonIndentation = "";

   // Walk each line start (the character after each '\n'), grouping
   // consecutive lines whose first mismatch against Indent is at the same
   // offset so that one diagnostic/fix-it can cover the whole run.
   for (size_t pos = Bytes.find('\n'); pos != StringRef::npos; pos = Bytes.find('\n', pos + 1)) {
      size_t nextpos = pos + 1;
      auto restOfBytes = Bytes.substr(nextpos);

      // Ignore blank lines.
      if (restOfBytes[0] == '\n' || restOfBytes[0] == '\r') {
         continue;
      }

      // Where is the first difference?
      auto errorOffset = commonPrefixLength(Indent, restOfBytes);

      // Are we starting a new run?
      if (errorOffset != lastMistakeOffset) {
         // Diagnose problems in the just-finished run of lines.
         diagnoseInvalidMultilineIndents(Diags, Indent, IndentStartLoc, Bytes,
                                         linesWithLastMistakeOffset, lastMistakeOffset,
                                         commonIndentation);

         // Set up for a new run.
         lastMistakeOffset = errorOffset;
         linesWithLastMistakeOffset = {};

         // To begin with, all whitespace is part of the common indentation.
         auto prefixLength = restOfBytes.find_first_not_of(" \t");
         commonIndentation = restOfBytes.substr(0, prefixLength);
      } else {
         // We're continuing the run, so include this line in the common prefix.
         auto prefixLength = commonPrefixLength(commonIndentation, restOfBytes);
         commonIndentation = commonIndentation.substr(0, prefixLength);
      }

      // Either way, add this line to the run.
      linesWithLastMistakeOffset.push_back(nextpos);
   }

   // Handle the last run.
   diagnoseInvalidMultilineIndents(Diags, Indent, IndentStartLoc, Bytes,
                                   linesWithLastMistakeOffset, lastMistakeOffset,
                                   commonIndentation);
}

/// Emit diagnostics for single-quote string and suggest replacement
/// with double-quoted equivalent.
/// Emit diagnostics for single-quote string and suggest replacement
/// with double-quoted equivalent.
///
/// \param TokStart pointer to the opening '\'' of the lexed token.
/// \param TokEnd pointer one past the closing '\''.
/// \param D diagnostic engine; if null, nothing is emitted.
static void diagnoseSingleQuoteStringLiteral(const char *TokStart,
                                             const char *TokEnd,
                                             DiagnosticEngine *D) {
   assert(*TokStart == '\'' && TokEnd[-1] == '\'');
   if (!D)
      return;

   // Build the double-quoted replacement text.  OutputPtr marks the start of
   // the run of source characters not yet copied into 'replacement'; runs are
   // flushed whenever a quote needs (un)escaping.
   SmallString<32> replacement;
   replacement.push_back('"');
   const char *Ptr = TokStart + 1;
   const char *OutputPtr = Ptr;

   while (*Ptr++ != '\'' && Ptr < TokEnd) {
      if (Ptr[-1] == '\\') {
         if (*Ptr == '\'') {
            replacement.append(OutputPtr, Ptr - 1);
            OutputPtr = Ptr + 1;
            // Un-escape single quotes: \' needs no escape inside "...".
            replacement.push_back('\'');
         } else if (*Ptr == '(') {
            // Preserve the contents of interpolation.
            // Fix: scan is bounded by the token's own end.  The previous code
            // passed replacement.end() — the end of the local scratch buffer,
            // which is unrelated to the buffer Ptr walks — as the limit.
            Ptr = skipToEndOfInterpolatedExpression(Ptr + 1, TokEnd,
               /*IsMultiline=*/false);
            assert(*Ptr == ')');
         }
         // Skip over escaped characters.
         ++Ptr;
      } else if (Ptr[-1] == '"') {
         replacement.append(OutputPtr, Ptr - 1);
         OutputPtr = Ptr;
         // Escape double quotes.
         replacement.append("\\\"");
      }
   }
   assert(Ptr == TokEnd && Ptr[-1] == '\'');
   // Flush the tail (excluding the closing single quote) and terminate the
   // double-quoted form.
   replacement.append(OutputPtr, Ptr - 1);
   replacement.push_back('"');

   D->diagnose(Lexer::getSourceLoc(TokStart), diag::lex_single_quote_string)
      .fixItReplaceChars(Lexer::getSourceLoc(TokStart),
                         Lexer::getSourceLoc(TokEnd), replacement);
}

/// lexStringLiteral:
///   string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
///   string_literal ::= ["]["]["].*["]["]["] - approximately
///   string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings
/// lexStringLiteral:
///   string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
///   string_literal ::= ["]["]["].*["]["]["] - approximately
///   string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings
void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
   // On entry CurPtr points one past the opening quote; rewind over the quote
   // and any custom '#' delimiter so the token covers the entire literal.
   const char QuoteChar = CurPtr[-1];
   const char *TokStart = CurPtr - 1 - CustomDelimiterLen;

   // NOTE: We only allow single-quote string literals so we can emit useful
   // diagnostics about changing them to double quotes.
   assert((QuoteChar == '"' || QuoteChar == '\'') && "Unexpected start");

   // A multiline (""") opening delimiter must be immediately followed by a
   // newline; diagnose with a fix-it if it isn't.
   bool IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags);
   if (IsMultilineString && *CurPtr != '\n' && *CurPtr != '\r')
      diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
         .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");

   bool wasErroneous = false;
   while (true) {
      // Handle string interpolation: backslash, the custom '#' delimiter (if
      // any), then '('.
      const char *TmpPtr = CurPtr + 1;
      if (*CurPtr == '\\' &&
          delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr) &&
          *TmpPtr++ == '(') {
         // Consume tokens until we hit the corresponding ')'.
         CurPtr = skipToEndOfInterpolatedExpression(TmpPtr, BufferEnd,
                                                    IsMultilineString);
         if (*CurPtr == ')') {
            // Successfully scanned the body of the expression literal.
            ++CurPtr;
            continue;
         } else if ((*CurPtr == '\r' || *CurPtr == '\n') && IsMultilineString) {
            // The only case we reach here is unterminated single line string in the
            // interpolation. For better recovery, go on after emitting an error.
            diagnose(CurPtr, diag::lex_unterminated_string);
            wasErroneous = true;
            continue;
         } else {
            diagnose(TokStart, diag::lex_unterminated_string);
            return formToken(tok::unknown, TokStart);
         }
      }

      // String literals cannot have \n or \r in them (unless multiline).
      if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString)
          || CurPtr == BufferEnd) {
         diagnose(TokStart, diag::lex_unterminated_string);
         return formToken(tok::unknown, TokStart);
      }

      // Lex one character or escape.  ~0U signals the closing delimiter was
      // reached; ~1U signals an invalid character that was already diagnosed.
      unsigned CharValue = lexCharacter(CurPtr, QuoteChar, true,
                                        IsMultilineString, CustomDelimiterLen);
      // This is the end of string, we are done.
      if (CharValue == ~0U)
         break;

      // Remember we had already-diagnosed invalid characters.
      wasErroneous |= CharValue == ~1U;
   }

   if (QuoteChar == '\'') {
      assert(!IsMultilineString && CustomDelimiterLen == 0 &&
             "Single quoted string cannot have custom delimitor, nor multiline");
      diagnoseSingleQuoteStringLiteral(TokStart, CurPtr, Diags);
   }

   if (wasErroneous)
      return formToken(tok::unknown, TokStart);

   return formStringLiteralToken(TokStart, IsMultilineString,
                                 CustomDelimiterLen);
}


/// We found an opening curly quote in the source file.  Scan ahead until we
/// find and end-curly-quote (or straight one).  If we find what looks to be a
/// string literal, diagnose the problem and return a pointer to the end of the
/// entire string literal.  This helps us avoid parsing the body of the string
/// as program tokens, which will only lead to massive confusion.
/// We found an opening curly quote in the source file.  Scan ahead until we
/// find and end-curly-quote (or straight one).  If we find what looks to be a
/// string literal, diagnose the problem and return a pointer to the end of the
/// entire string literal.  This helps us avoid parsing the body of the string
/// as program tokens, which will only lead to massive confusion.
///
/// Returns nullptr when no plausible closing quote exists on this line (an
/// interpolation, a newline, or the end of buffer is hit first).
const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body,
                                                    bool EmitDiagnostics) {

   while (true) {
      // Don't bother with string interpolations.
      if (*Body == '\\' && *(Body + 1) == '(')
         return nullptr;

      // We didn't find the end of the string literal if we ran to end of line.
      if (*Body == '\r' || *Body == '\n' || Body == BufferEnd)
         return nullptr;

      // Get the next character.  lexCharacter advances Body past it.
      const char *CharStart = Body;
      unsigned CharValue = lexCharacter(Body, '\0', /*EmitDiagnostics=*/false);
      // If the character was incorrectly encoded, give up.
      if (CharValue == ~1U) return nullptr;

      // If we found a straight-quote, then we're done.  Just return the spot
      // to continue.
      if (CharValue == '"')
         return Body;

      // If we found an ending curly quote (common since this thing started with
      // an opening curly quote) diagnose it with a fixit and then return.
      // U+201D is the right double quotation mark.
      if (CharValue == 0x0000201D) {
         if (EmitDiagnostics) {
            diagnose(CharStart, diag::lex_invalid_curly_quote)
               .fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body),
                                  "\"");
         }
         return Body;
      }

      // Otherwise, keep scanning.
   }
}


/// lexEscapedIdentifier:
///   identifier ::= '`' identifier '`'
///
/// If it doesn't match this production, the leading ` is a punctuator.
/// lexEscapedIdentifier:
///   identifier ::= '`' identifier '`'
///
/// If it doesn't match this production, the leading ` is a punctuator.
void Lexer::lexEscapedIdentifier() {
   assert(CurPtr[-1] == '`' && "Unexpected start of escaped identifier");

   const char *BacktickLoc = CurPtr - 1;
   const char *BodyStart = CurPtr;

   // Attempt to scan a full identifier followed by a closing backtick; only
   // then is this an escaped-identifier token.
   if (advanceIfValidStartOfIdentifier(CurPtr, BufferEnd)) {
      // Consume the remainder of the identifier.
      while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) {
      }

      if (*CurPtr == '`') {
         // `ident` — consume the terminator and form the token.
         ++CurPtr;
         formEscapedIdentifierToken(BacktickLoc);
         return;
      }
   }

   // Special case; allow '`$`'.
   if (BacktickLoc[1] == '$' && BacktickLoc[2] == '`') {
      CurPtr = BacktickLoc + 3;
      formEscapedIdentifierToken(BacktickLoc);
      return;
   }

   // No escaped identifier here; rewind and emit the backtick as punctuation.
   CurPtr = BodyStart;
   formToken(tok::backtick, BacktickLoc);
}

/// Find the end of a version control conflict marker.
static const char *findConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
   StringRef terminator = CMK == ConflictMarkerKind::Perforce ? "<<<<\n"
                                                              : ">>>>>>> ";
   size_t termLen = terminator.size();

   // Get a reference to the rest of the buffer minus the length of the start
   // of the conflict marker.
   auto restOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(termLen);
   size_t endPos = restOfBuffer.find(terminator);
   while (endPos != StringRef::npos) {
      // Must occur at start of line.
      if (endPos != 0 &&
          (restOfBuffer[endPos - 1] == '\r' || restOfBuffer[endPos - 1] == '\n')) {
         return restOfBuffer.data() + endPos;
      }
      restOfBuffer = restOfBuffer.substr(endPos + termLen);
      endPos = restOfBuffer.find(terminator);
   }
   return nullptr;
}

bool Lexer::tryLexConflictMarker(bool EatNewline) {
   const char *Ptr = CurPtr - 1;

   // Only a conflict marker if it starts at the beginning of a line.
   if (Ptr != ContentStart && Ptr[-1] != '\n' && Ptr[-1] != '\r')
      return false;

   // Check to see if we have <<<<<<< or >>>>.
   StringRef restOfBuffer(Ptr, BufferEnd - Ptr);
   if (!restOfBuffer.startswith("<<<<<<< ") && !restOfBuffer.startswith(">>>> "))
      return false;

   ConflictMarkerKind Kind = *Ptr == '<' ? ConflictMarkerKind::Normal
                                         : ConflictMarkerKind::Perforce;
   if (const char *End = findConflictEnd(Ptr, BufferEnd, Kind)) {
      // Diagnose at the conflict marker, then jump ahead to the end.
      diagnose(CurPtr, diag::lex_conflict_marker_in_file);
      CurPtr = End;

      // Skip ahead to the end of the marker.
      if (CurPtr != BufferEnd)
         skipToEndOfLine(EatNewline);

      return true;
   }

   // No end of conflict marker found.
   return false;
}

/// Handle a character that starts no known token.  Diagnoses the problem and
/// advances CurPtr past the offending bytes.
///
/// Returns true if the bytes should be tokenized (as tok::unknown) and false
/// if they should be treated as whitespace/trivia and skipped.
bool Lexer::lexUnknown(bool EmitDiagnosticsIfToken) {
   const char *Tmp = CurPtr - 1;

   if (advanceIfValidContinuationOfIdentifier(Tmp, BufferEnd)) {
      // If this is a valid identifier continuation, but not a valid identifier
      // start, attempt to recover by eating more continuation characters.
      if (EmitDiagnosticsIfToken) {
         diagnose(CurPtr - 1, diag::lex_invalid_identifier_start_character);
      }
      while (advanceIfValidContinuationOfIdentifier(Tmp, BufferEnd));
      CurPtr = Tmp;
      return true;
   }

   // This character isn't allowed in Swift source.
   uint32_t Codepoint = validateUTF8CharacterAndAdvance(Tmp, BufferEnd);
   if (Codepoint == ~0U) {
      // Malformed UTF-8 sequence; suggest replacing it with a space.
      diagnose(CurPtr - 1, diag::lex_invalid_utf8)
         .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ");
      CurPtr = Tmp;
      return false; // Skip presumed whitespace.
   } else if (Codepoint == 0x000000A0) {
      // Non-breaking whitespace (U+00A0)
      // Consume the whole run of U+00A0 (0xC2 0xA0 in UTF-8) and offer a
      // fix-it replacing each two-byte sequence with one ASCII space.
      while (Tmp[0] == '\xC2' && Tmp[1] == '\xA0')
         Tmp += 2;
      SmallString<8> Spaces;
      Spaces.assign((Tmp - CurPtr + 1) / 2, ' ');
      diagnose(CurPtr - 1, diag::lex_nonbreaking_space)
         .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp),
                            Spaces);
      CurPtr = Tmp;
      return false;
   } else if (Codepoint == 0x0000201D) {
      // If this is an end curly quote, just diagnose it with a fixit hint.
      if (EmitDiagnosticsIfToken) {
         diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
            .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), "\"");
      }
      CurPtr = Tmp;
      return true;
   } else if (Codepoint == 0x0000201C) {
      auto EndPtr = Tmp;
      // If this is a start curly quote, do a fuzzy match of a string literal
      // to improve recovery.
      if (auto Tmp2 =
         findEndOfCurlyQuoteStringLiteral(Tmp, EmitDiagnosticsIfToken))
         Tmp = Tmp2;

      // Note, we intentionally diagnose the end quote before the start quote,
      // so that the IDE suggests fixing the end quote before the start quote.
      // This, in turn, works better with our error recovery because we won't
      // diagnose an end curly quote in the middle of a straight quoted
      // literal.
      if (EmitDiagnosticsIfToken) {
         diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
            .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(EndPtr),
                               "\"");
      }
      CurPtr = Tmp;
      return true;
   }

   // Generic invalid character: suggest replacing with a space.
   diagnose(CurPtr - 1, diag::lex_invalid_character)
      .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ");

   // If the codepoint is a known Unicode confusable (e.g. a Greek question
   // mark that looks like ';'), additionally suggest the ASCII lookalike.
   char ExpectedCodepoint;
   if ((ExpectedCodepoint =
           confusable::tryConvertConfusableCharacterToASCII(Codepoint))) {

      llvm::SmallString<4> ConfusedChar;
      EncodeToUTF8(Codepoint, ConfusedChar);
      llvm::SmallString<1> ExpectedChar;
      ExpectedChar += ExpectedCodepoint;
      diagnose(CurPtr - 1, diag::lex_confusable_character, ConfusedChar,
               ExpectedChar)
         .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp),
                            ExpectedChar);
   }

   CurPtr = Tmp;
   return false; // Skip presumed whitespace.
}

/// Classify a nul byte: the code-completion marker, the terminator at the end
/// of the buffer, or a stray nul embedded in the source text.
Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
   assert(Ptr != nullptr && *Ptr == 0);
   if (Ptr == CodeCompletionPtr)
      return NulCharacterKind::CodeCompletion;
   return Ptr == BufferEnd ? NulCharacterKind::BufferEnd
                           : NulCharacterKind::Embedded;
}

/// Try to lex an editor placeholder '<#...#>' starting at CurPtr-1.  A
/// well-formed placeholder is flagged with a diagnostic and lexed as an
/// identifier; otherwise the '<' is lexed as an operator.
void Lexer::tryLexEditorPlaceholder() {
   assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
   const char *TokStart = CurPtr - 1;

   const char *Scan = CurPtr + 1;
   while (Scan < BufferEnd - 1) {
      // A placeholder may not span lines or contain another opening '<#'.
      if (*Scan == '\n' || (Scan[0] == '<' && Scan[1] == '#'))
         break;

      if (Scan[0] == '#' && Scan[1] == '>') {
         // Found it. Flag it as error (or warning, if in playground mode) for the
         // rest of the compiler pipeline and lex it as an identifier.
         if (LangOpts.Playground) {
            diagnose(TokStart, diag::lex_editor_placeholder_in_playground);
         } else {
            diagnose(TokStart, diag::lex_editor_placeholder);
         }
         CurPtr = Scan + 2;
         formToken(tok::identifier, TokStart);
         return;
      }

      ++Scan;
   }

   // Not a well-formed placeholder.
   lexOperatorIdentifier();
}

/// Decode one string-literal segment: process escape sequences, normalize
/// multiline line endings to '\n', and strip multiline indentation.
///
/// \param Bytes the raw source bytes of the segment (no quotes).
/// \param TempString scratch buffer for the decoded text.
/// \param IsFirstSegment/IsLastSegment whether this segment abuts the opening
///        or closing delimiter (controls stripping of the leading/trailing
///        newline of a multiline literal).
/// \param IndentToStrip number of indentation bytes removed after each
///        newline; ~0U means "compute from the literal's trailing indent".
/// \param CustomDelimiterLen length of the '#' delimiter for raw strings.
/// \returns the decoded text — either Bytes itself (when nothing changed) or
///          a view of TempString.
StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes,
                                             SmallVectorImpl<char> &TempString,
                                             bool IsFirstSegment,
                                             bool IsLastSegment,
                                             unsigned IndentToStrip,
                                             unsigned CustomDelimiterLen) {

   TempString.clear();
   // Note that it is always safe to read one over the end of "Bytes" because we
   // know that there is a terminating " character (or null byte for an
   // unterminated literal or a segment that doesn't come from source). Use
   // BytesPtr to avoid a range check subscripting on the StringRef.
   const char *BytesPtr = Bytes.begin();

   // Special case when being called from EncodedDiagnosticMessage(...)
   // This should allow multiline strings to work as attribute messages.
   if (IndentToStrip == ~0U)
      IndentToStrip = getMultilineTrailingIndent(Bytes).size();

   bool IsEscapedNewline = false;
   while (BytesPtr < Bytes.end()) {
      char CurChar = *BytesPtr++;

      // Multiline string line ending normalization and indent stripping.
      if (CurChar == '\r' || CurChar == '\n') {
         // Drop the newline entirely if it was escaped with '\', or if it is
         // the literal's very first or very last line break.
         bool stripNewline = IsEscapedNewline ||
                             (IsFirstSegment && BytesPtr - 1 == Bytes.begin());
         // Treat "\r\n" as a single line ending.
         if (CurChar == '\r' && *BytesPtr == '\n')
            ++BytesPtr;
         // Strip the indentation prefix, except on blank lines.
         if (*BytesPtr != '\r' && *BytesPtr != '\n')
            BytesPtr += IndentToStrip;
         if (IsLastSegment && BytesPtr == Bytes.end())
            stripNewline = true;
         if (!stripNewline)
            TempString.push_back('\n');
         IsEscapedNewline = false;
         continue;
      }

      // Anything that isn't a '\' followed by a matching raw-string delimiter
      // is copied through verbatim.
      if (CurChar != '\\' ||
          !delimiterMatches(CustomDelimiterLen, BytesPtr, nullptr)) {
         TempString.push_back(CurChar);
         continue;
      }

      // Invalid escapes are accepted by the lexer but diagnosed as an error.  We
      // just ignore them here.
      unsigned CharValue = 0; // Unicode character value for \x, \u, \U.
      switch (*BytesPtr++) {
         default:
            continue;   // Invalid escape, ignore it.

            // Simple single-character escapes.
         case '0':
            TempString.push_back('\0');
            continue;
         case 'n':
            TempString.push_back('\n');
            continue;
         case 'r':
            TempString.push_back('\r');
            continue;
         case 't':
            TempString.push_back('\t');
            continue;
         case '"':
            TempString.push_back('"');
            continue;
         case '\'':
            TempString.push_back('\'');
            continue;
         case '\\':
            TempString.push_back('\\');
            continue;

         case ' ':
         case '\t':
         case '\n':
         case '\r':
            // A backslash followed by whitespace may be an escaped newline
            // (line continuation in a multiline literal).
            if (maybeConsumeNewlineEscape(BytesPtr, -1)) {
               IsEscapedNewline = true;
               --BytesPtr;
            }
            continue;

            // String interpolation.
         case '(':
            llvm_unreachable("string contained interpolated segments");

            // Unicode escapes of various lengths.
         case 'u':  //  \u HEX HEX HEX HEX
            if (BytesPtr[0] != '{')
               continue;       // Ignore invalid escapes.

            CharValue = lexUnicodeEscape(BytesPtr, /*no diagnostics*/nullptr);
            // Ignore invalid escapes.
            if (CharValue == ~1U) continue;
            break;
      }

      // Append the decoded Unicode scalar, UTF-8 encoding it when necessary.
      if (CharValue < 0x80)
         TempString.push_back(CharValue);
      else
         EncodeToUTF8(CharValue, TempString);
   }

   // If we didn't escape or reprocess anything, then we don't need to use the
   // temporary string, just point to the original one. We know that this
   // is safe because unescaped strings are always shorter than their escaped
   // forms (in a valid string).
   if (TempString.size() == Bytes.size()) {
      TempString.clear();
      return Bytes;
   }
   return StringRef(TempString.begin(), TempString.size());
}

/// Split a string_literal token into its literal-text and interpolated
/// expression segments, appending them to \p Segments in source order.
/// Always ends with a (possibly empty) literal segment.
void Lexer::getStringLiteralSegments(
   const Token &Str,
   SmallVectorImpl<StringSegment> &Segments,
   DiagnosticEngine *Diags) {
   assert(Str.is(tok::string_literal));
   // Get the bytes behind the string literal, dropping any double quotes.
   StringRef Bytes = getStringLiteralContent(Str);

   // Are substitutions required either for indent stripping or line ending
   // normalization?
   bool MultilineString = Str.isMultilineString(), IsFirstSegment = true;
   unsigned IndentToStrip = 0, CustomDelimiterLen = Str.getCustomDelimiterLen();
   if (MultilineString)
      IndentToStrip = getMultilineTrailingIndent(Bytes).size();

   // Note that it is always safe to read one over the end of "Bytes" because
   // we know that there is a terminating " character.  Use BytesPtr to avoid a
   // range check subscripting on the StringRef.
   const char *SegmentStartPtr = Bytes.begin();
   const char *BytesPtr = SegmentStartPtr;
   size_t pos;
   // Scan for '\' (then the custom '#' delimiter, then '(') marking the start
   // of each interpolation.
   while ((pos = Bytes.find('\\', BytesPtr - Bytes.begin())) != StringRef::npos) {
      BytesPtr = Bytes.begin() + pos + 1;

      if (!delimiterMatches(CustomDelimiterLen, BytesPtr, Diags) ||
          *BytesPtr++ != '(')
         continue;

      // String interpolation.

      // Push the current segment.  Length excludes the '\', delimiter and '('.
      Segments.push_back(
         StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                   BytesPtr - SegmentStartPtr - 2 - CustomDelimiterLen,
                                   IsFirstSegment, false, IndentToStrip,
                                   CustomDelimiterLen));
      IsFirstSegment = false;

      // Find the closing ')'.
      const char *End = skipToEndOfInterpolatedExpression(
         BytesPtr, Str.getText().end(), MultilineString);
      assert(*End == ')' && "invalid string literal interpolations should"
                            " not be returned as string literals");
      ++End;

      // Add an expression segment.  Spans '(' through ')' inclusive.
      Segments.push_back(
         StringSegment::getExpr(getSourceLoc(BytesPtr - 1), End - BytesPtr + 1));

      // Reset the beginning of the segment to the string that remains to be
      // consumed.
      SegmentStartPtr = BytesPtr = End;
   }

   // Trailing literal segment (possibly empty) after the last interpolation.
   Segments.push_back(
      StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                Bytes.end() - SegmentStartPtr,
                                IsFirstSegment, true, IndentToStrip,
                                CustomDelimiterLen));
}


//===----------------------------------------------------------------------===//
// Main Lexer Loop
//===----------------------------------------------------------------------===//

/// Lex one token into NextToken.  This is the lexer's main dispatch: it
/// collects leading trivia, then switches on the first byte of the token and
/// delegates to the specialized lex* routines.
void Lexer::lexImpl() {
   assert(CurPtr >= BufferStart &&
          CurPtr <= BufferEnd && "Current pointer out of range!");

   LeadingTrivia.clear();
   TrailingTrivia.clear();

   if (CurPtr == BufferStart) {
      // At the very start of the buffer: record a UTF-8 BOM (if present) as
      // trivia and mark the token as being at the start of a line.
      if (BufferStart < ContentStart) {
         size_t BOMLen = ContentStart - BufferStart;
         assert(BOMLen == 3 && "UTF-8 BOM is 3 bytes");
         // Add UTF-8 BOM to LeadingTrivia.
         LeadingTrivia.push_back(TriviaKind::GarbageText, BOMLen);
         CurPtr += BOMLen;
      }
      NextToken.setAtStartOfLine(true);
   } else {
      NextToken.setAtStartOfLine(false);
   }

   // Consume whitespace, comments, conflict markers, etc. before the token.
   lexTrivia(LeadingTrivia, /* IsForTrailingTrivia */ false);

   // Remember the start of the token so we can form the text range.
   const char *TokStart = CurPtr;

   // Dispatch on the first byte.  Cast to signed char so UTF-16 BOM bytes
   // 0xFF/0xFE appear as -1/-2 below.
   switch ((signed char) *CurPtr++) {
      default: {
         // Non-ASCII byte: try identifier start, then operator start, then
         // fall back to error recovery in lexUnknown.
         char const *Tmp = CurPtr - 1;
         if (advanceIfValidStartOfIdentifier(Tmp, BufferEnd))
            return lexIdentifier();

         if (advanceIfValidStartOfOperator(Tmp, BufferEnd))
            return lexOperatorIdentifier();

         bool ShouldTokenize = lexUnknown(/*EmitDiagnosticsIfToken=*/true);
         assert(
            ShouldTokenize &&
            "Invalid UTF-8 sequence should be eaten by lexTrivia as LeadingTrivia");
         (void) ShouldTokenize;
         return formToken(tok::unknown, TokStart);
      }

      case '\n':
      case '\r':
         llvm_unreachable("Newlines should be eaten by lexTrivia as LeadingTrivia");

      case ' ':
      case '\t':
      case '\f':
      case '\v':
         llvm_unreachable(
            "Whitespaces should be eaten by lexTrivia as LeadingTrivia");

      case -1:
      case -2:
         // 0xFF/0xFE: a UTF-16 BOM, which we don't support.
         diagnose(CurPtr - 1, diag::lex_utf16_bom_marker);
         CurPtr = BufferEnd;
         return formToken(tok::unknown, TokStart);

      case 0:
         switch (getNulCharacterKind(CurPtr - 1)) {
            case NulCharacterKind::CodeCompletion:
               // Consume any identifier-continuation characters so the
               // code-completion token covers the partial identifier.
               while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
               return formToken(tok::code_complete, TokStart);

            case NulCharacterKind::BufferEnd:
               // This is the real end of the buffer.
               // Put CurPtr back into buffer bounds.
               --CurPtr;
               // Return EOF.
               return formToken(tok::eof, TokStart);

            case NulCharacterKind::Embedded:
               llvm_unreachable(
                  "Embedded nul should be eaten by lexTrivia as LeadingTrivia");
         }

      // Single-character punctuation tokens.
      case '@':
         return formToken(tok::at_sign, TokStart);
      case '{':
         return formToken(tok::l_brace, TokStart);
      case '[':
         return formToken(tok::l_square, TokStart);
      case '(':
         return formToken(tok::l_paren, TokStart);
      case '}':
         return formToken(tok::r_brace, TokStart);
      case ']':
         return formToken(tok::r_square, TokStart);
      case ')':
         return formToken(tok::r_paren, TokStart);

      case ',':
         return formToken(tok::comma, TokStart);
      case ';':
         return formToken(tok::semi, TokStart);
      case ':':
         return formToken(tok::colon, TokStart);
      case '\\':
         return formToken(tok::backslash, TokStart);

      case '#':
         // Either the '#'-delimiter of a raw string literal, or a '#keyword'.
         if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))
            return lexStringLiteral(CustomDelimiterLen);
         return lexHash();

         // Operator characters.
      case '/':
         if (CurPtr[0] == '/') {  // "//"
            skipSlashSlashComment(/*EatNewline=*/true);
            assert(isKeepingComments() &&
                   "Non token comment should be eaten by lexTrivia as LeadingTrivia");
            return formToken(tok::comment, TokStart);
         }
         if (CurPtr[0] == '*') { // "/*"
            skipSlashStarComment();
            assert(isKeepingComments() &&
                   "Non token comment should be eaten by lexTrivia as LeadingTrivia");
            return formToken(tok::comment, TokStart);
         }
         return lexOperatorIdentifier();
      case '%':
         // Lex %[0-9a-zA-Z_]+ as a local PIL value
         if (InPILBody && clang::isIdentifierBody(CurPtr[0])) {
            do {
               ++CurPtr;
            } while (clang::isIdentifierBody(CurPtr[0]));

            return formToken(tok::pil_local_name, TokStart);
         }
         return lexOperatorIdentifier();

      case '!':
         if (InPILBody)
            return formToken(tok::pil_exclamation, TokStart);
         // Left-bound '!' is the postfix unwrap operator.
         if (isLeftBound(TokStart, ContentStart))
            return formToken(tok::exclaim_postfix, TokStart);
         return lexOperatorIdentifier();

      case '?':
         // Left-bound '?' is the postfix optional-chaining operator.
         if (isLeftBound(TokStart, ContentStart))
            return formToken(tok::question_postfix, TokStart);
         return lexOperatorIdentifier();

      case '<':
         // '<#' starts an editor placeholder.
         if (CurPtr[0] == '#')
            return tryLexEditorPlaceholder();

         return lexOperatorIdentifier();
      case '>':
         return lexOperatorIdentifier();

      case '=':
      case '-':
      case '+':
      case '*':
      case '&':
      case '|':
      case '^':
      case '~':
      case '.':
         return lexOperatorIdentifier();

      // ASCII identifier-start characters.
      case 'A':
      case 'B':
      case 'C':
      case 'D':
      case 'E':
      case 'F':
      case 'G':
      case 'H':
      case 'I':
      case 'J':
      case 'K':
      case 'L':
      case 'M':
      case 'N':
      case 'O':
      case 'P':
      case 'Q':
      case 'R':
      case 'S':
      case 'T':
      case 'U':
      case 'V':
      case 'W':
      case 'X':
      case 'Y':
      case 'Z':
      case 'a':
      case 'b':
      case 'c':
      case 'd':
      case 'e':
      case 'f':
      case 'g':
      case 'h':
      case 'i':
      case 'j':
      case 'k':
      case 'l':
      case 'm':
      case 'n':
      case 'o':
      case 'p':
      case 'q':
      case 'r':
      case 's':
      case 't':
      case 'u':
      case 'v':
      case 'w':
      case 'x':
      case 'y':
      case 'z':
      case '_':
         return lexIdentifier();

      case '$':
         return lexDollarIdent();

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
         return lexNumber();

      case '"':
      case '\'':
         return lexStringLiteral();

      case '`':
         return lexEscapedIdentifier();
   }
}

/// Retokenize the buffer containing \p Loc and return the token starting
/// there; returns an empty Token for invalid or unmapped locations.
Token Lexer::getTokenAtLocation(const SourceManager &SM, SourceLoc Loc) {
   // An invalid location cannot correspond to any token.
   if (!Loc.isValid())
      return Token();

   // Locate the buffer that owns this location.
   int BufferID = SM.findBufferContainingLoc(Loc);
   if (BufferID < 0)
      return Token();

   // Use fake language options; language options only affect validity
   // and the exact token produced.
   LangOptions FakeLangOpts;

   // Here we return comments as tokens because either the caller skipped
   // comments and normally we won't be at the beginning of a comment token
   // (making this option irrelevant), or the caller lexed comments and
   // we need to lex just the comment token.
   Lexer TheLexer(FakeLangOpts, SM, BufferID, nullptr, LexerMode::Polarphp,
                  HashbangMode::Allowed, CommentRetentionMode::ReturnAsTokens);
   TheLexer.restoreState(State(Loc));
   return TheLexer.peekNextToken();
}

/// Consume trivia (whitespace, newlines, comments, hashbangs, conflict
/// markers, garbage bytes) starting at CurPtr and append the pieces to
/// \p Pieces.  Stops — leaving CurPtr at the first byte of the next token —
/// when a token-start character is reached.
///
/// \param IsForTrailingTrivia when true, newlines and comments are NOT
///        consumed (they belong to the next token's leading trivia).
void Lexer::lexTrivia(ParsedTrivia &Pieces, bool IsForTrailingTrivia) {
Restart:
   const char *TriviaStart = CurPtr;

   // Dispatch on the next byte; each trivia case loops back to Restart,
   // while token-start cases break out (and CurPtr is rewound at the end).
   switch ((signed char) *CurPtr++) {
      case '\n':
         if (IsForTrailingTrivia)
            break;
         NextToken.setAtStartOfLine(true);
         Pieces.appendOrSquash(TriviaKind::Newline, 1);
         goto Restart;
      case '\r':
         if (IsForTrailingTrivia)
            break;
         NextToken.setAtStartOfLine(true);
         // Treat "\r\n" as one piece of trivia.
         if (CurPtr[0] == '\n') {
            Pieces.appendOrSquash(TriviaKind::CarriageReturnLineFeed, 2);
            ++CurPtr;
         } else {
            Pieces.appendOrSquash(TriviaKind::CarriageReturn, 1);
         }
         goto Restart;
      case ' ':
         Pieces.appendOrSquash(TriviaKind::Space, 1);
         goto Restart;
      case '\t':
         Pieces.appendOrSquash(TriviaKind::Tab, 1);
         goto Restart;
      case '\v':
         Pieces.appendOrSquash(TriviaKind::VerticalTab, 1);
         goto Restart;
      case '\f':
         Pieces.appendOrSquash(TriviaKind::Formfeed, 1);
         goto Restart;
      case '/':
         if (IsForTrailingTrivia || isKeepingComments()) {
            // Don't lex comments as trailing trivias (for now).
            // Don't try to lex comments here if we are lexing comments as Tokens.
            break;
         } else if (*CurPtr == '/') {
            // '// ...' comment.
            bool isDocComment = CurPtr[1] == '/';
            skipSlashSlashComment(/*EatNewline=*/false);
            size_t Length = CurPtr - TriviaStart;
            Pieces.push_back(isDocComment ? TriviaKind::DocLineComment
                                          : TriviaKind::LineComment, Length);
            goto Restart;
         } else if (*CurPtr == '*') {
            // '/* ... */' comment.
            bool isDocComment = CurPtr[1] == '*';
            skipSlashStarComment();
            size_t Length = CurPtr - TriviaStart;
            Pieces.push_back(isDocComment ? TriviaKind::DocBlockComment
                                          : TriviaKind::BlockComment, Length);
            goto Restart;
         }
         break;
      case '#':
         // A hashbang is only trivia at the very start of the content.
         if (TriviaStart == ContentStart && *CurPtr == '!') {
            // Hashbang '#!/path/to/swift'.
            --CurPtr;
            if (!IsHashbangAllowed)
               diagnose(TriviaStart, diag::lex_hashbang_not_allowed);
            skipHashbang(/*EatNewline=*/false);
            size_t Length = CurPtr - TriviaStart;
            Pieces.push_back(TriviaKind::GarbageText, Length);
            goto Restart;
         }
         break;
      case '<':
      case '>':
         if (tryLexConflictMarker(/*EatNewline=*/false)) {
            // Conflict marker.
            size_t Length = CurPtr - TriviaStart;
            Pieces.push_back(TriviaKind::GarbageText, Length);
            goto Restart;
         }
         break;
      case 0:
         switch (getNulCharacterKind(CurPtr - 1)) {
            case NulCharacterKind::Embedded: {
               // Stray nul in the source: diagnose and treat as garbage trivia.
               diagnoseEmbeddedNul(Diags, CurPtr - 1);
               size_t Length = CurPtr - TriviaStart;
               Pieces.push_back(TriviaKind::GarbageText, Length);
               goto Restart;
            }
            case NulCharacterKind::CodeCompletion:
            case NulCharacterKind::BufferEnd:
               break;
         }
         break;
         // Start character of tokens.
      case -1:
      case -2:
      case '@':
      case '{':
      case '[':
      case '(':
      case '}':
      case ']':
      case ')':
      case ',':
      case ';':
      case ':':
      case '\\':
      case '$':
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
      case '"':
      case '\'':
      case '`':
         // Start of identifiers.
      case 'A':
      case 'B':
      case 'C':
      case 'D':
      case 'E':
      case 'F':
      case 'G':
      case 'H':
      case 'I':
      case 'J':
      case 'K':
      case 'L':
      case 'M':
      case 'N':
      case 'O':
      case 'P':
      case 'Q':
      case 'R':
      case 'S':
      case 'T':
      case 'U':
      case 'V':
      case 'W':
      case 'X':
      case 'Y':
      case 'Z':
      case 'a':
      case 'b':
      case 'c':
      case 'd':
      case 'e':
      case 'f':
      case 'g':
      case 'h':
      case 'i':
      case 'j':
      case 'k':
      case 'l':
      case 'm':
      case 'n':
      case 'o':
      case 'p':
      case 'q':
      case 'r':
      case 's':
      case 't':
      case 'u':
      case 'v':
      case 'w':
      case 'x':
      case 'y':
      case 'z':
      case '_':
         // Start of operators.
      case '%':
      case '!':
      case '?':
      case '=':
      case '-':
      case '+':
      case '*':
      case '&':
      case '|':
      case '^':
      case '~':
      case '.':
         break;
      default:
         // Non-ASCII byte: identifier/operator starts are tokens; anything
         // else goes through lexUnknown's recovery and may become garbage
         // trivia.
         const char *Tmp = CurPtr - 1;
         if (advanceIfValidStartOfIdentifier(Tmp, BufferEnd)) {
            break;
         }
         if (advanceIfValidStartOfOperator(Tmp, BufferEnd)) {
            break;
         }

         bool ShouldTokenize = lexUnknown(/*EmitDiagnosticsIfToken=*/false);
         if (ShouldTokenize) {
            CurPtr = Tmp;
            return;
         }

         size_t Length = CurPtr - TriviaStart;
         Pieces.push_back(TriviaKind::GarbageText, Length);
         goto Restart;
   }
   // Reset the cursor.
   --CurPtr;
}

/// Return the location just past the end of the token at \p Loc, or an
/// invalid location when \p Loc doesn't land on a token.
SourceLoc Lexer::getLocForEndOfToken(const SourceManager &SM, SourceLoc Loc) {
   auto TokenLength = getTokenAtLocation(SM, Loc).getLength();
   return Loc.getAdvancedLocOrInvalid(TokenLength);
}


/// Re-lex the sub-range [BufferStart, BufferEnd) of buffer \p BufferID and
/// return the start location of the token that contains \p Offset.
/// If \p Offset lands inside an interpolated-expression segment of a string
/// literal, recurses to find the inner token. If \p Offset points into
/// whitespace (no token contains it), returns the location for \p Offset
/// itself.
static SourceLoc getLocForStartOfTokenInBuf(SourceManager &SM,
                                            unsigned BufferID,
                                            unsigned Offset,
                                            unsigned BufferStart,
                                            unsigned BufferEnd) {
   // Use fake language options; language options only affect validity
   // and the exact token produced.
   LangOptions FakeLangOptions;

   // Trivia and comments are dropped: only token boundaries matter here.
   Lexer L(FakeLangOptions, SM, BufferID, nullptr, LexerMode::Polarphp,
           HashbangMode::Allowed, CommentRetentionMode::None,
           TriviaRetentionMode::WithoutTrivia, BufferStart, BufferEnd);

   // Lex tokens until we find the token that contains the source location.
   Token Tok;
   do {
      L.lex(Tok);

      unsigned TokOffs = SM.getLocOffsetInBuffer(Tok.getLoc(), BufferID);
      if (TokOffs > Offset) {
         // We ended up skipping over the source location entirely, which means
         // that it points into whitespace. We are done here.
         break;
      }

      // TokOffs <= Offset here, so this tests Offset within [TokOffs, end).
      if (Offset < TokOffs + Tok.getLength()) {
         // Current token encompasses our source location.

         if (Tok.is(tok::string_literal)) {
            // Split the literal into its plain-text and interpolated-expression
            // segments so a location inside "\(expr)" resolves to a token of
            // the inner expression rather than to the whole literal.
            SmallVector<Lexer::StringSegment, 4> Segments;
            Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr);
            for (auto &Seg : Segments) {
               unsigned SegOffs = SM.getLocOffsetInBuffer(Seg.Loc, BufferID);
               unsigned SegEnd = SegOffs + Seg.Length;
               if (SegOffs > Offset)
                  break;

               // If the offset is inside an interpolated expr segment, re-lex.
               if (Seg.Kind == Lexer::StringSegment::Expr && Offset < SegEnd)
                  return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
                     /*BufferStart=*/SegOffs,
                     /*BufferEnd=*/SegEnd);
            }
         }

         return Tok.getLoc();
      }
   } while (Tok.isNot(tok::eof));

   // We've passed our source location; just return the original source location.
   return SM.getLocForOffset(BufferID, Offset);
}

/// Return a pointer to the first character of the line containing \p current,
/// i.e. the position just past the nearest preceding '\n' or '\r' (checking
/// \p current itself first), or \p bufStart if no line terminator is found.
static const char *findStartOfLine(const char *bufStart, const char *current) {
   const char *pos = current;
   // Walk backwards until we land on a line terminator or hit the buffer start.
   while (pos != bufStart && pos[0] != '\n' && pos[0] != '\r')
      --pos;

   // If we stopped on a terminator, the line begins just past it. If we ran
   // into the buffer start, the line begins there (bufStart[0] is never
   // examined, matching the bounds of the backward scan).
   return (pos == bufStart) ? bufStart : pos + 1;
}

/// Convenience overload: resolves \p Loc to its buffer and offset, then
/// forwards to the (BufferID, Offset) overload.
SourceLoc Lexer::getLocForStartOfToken(SourceManager &SM, SourceLoc Loc) {
   // An invalid location cannot belong to any token.
   if (Loc.isInvalid())
      return SourceLoc();
   const unsigned BufferId = SM.findBufferContainingLoc(Loc);
   const unsigned Offset = SM.getLocOffsetInBuffer(Loc, BufferId);
   return getLocForStartOfToken(SM, BufferId, Offset);
}

/// Returns the start location of the token containing the byte at \p Offset
/// in buffer \p BufferID, or an invalid location if \p Offset is out of range.
SourceLoc Lexer::getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
                                       unsigned Offset) {
   StringRef Buffer = SM.extractText(SM.getRangeForBuffer(BufferID));
   if (Offset > Buffer.size())
      return SourceLoc();

   const char *BufStart = Buffer.data();
   const char *StrData = BufStart + Offset;

   // A location sitting in whitespace is returned as-is; no token covers it.
   // NOTE(review): at Offset == Buffer.size() this reads one past the text —
   // presumably the buffer is NUL-terminated; confirm with SourceManager.
   const char C = StrData[0];
   if (C == '\n' || C == '\r' || C == ' ' || C == '\t')
      return SM.getLocForOffset(BufferID, Offset);

   // Back up to the beginning of the line (or the buffer) and relex from
   // there to find the token that covers Offset.
   const char *LexStart = findStartOfLine(BufStart, StrData);
   return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
      /*BufferStart=*/LexStart - BufStart,
      /*BufferEnd=*/Buffer.size());
}

/// Returns the location of the first character of the line containing \p Loc.
SourceLoc Lexer::getLocForStartOfLine(SourceManager &SM, SourceLoc Loc) {
   // Invalid locations pass through unchanged.
   if (Loc.isInvalid())
      return Loc;

   // Locate the buffer that owns this location.
   int BufferID = SM.findBufferContainingLoc(Loc);
   if (BufferID < 0)
      return SourceLoc();

   StringRef Buffer = SM.extractText(SM.getRangeForBuffer(BufferID));
   const char *BufStart = Buffer.data();
   const char *Pos = BufStart + SM.getLocOffsetInBuffer(Loc, BufferID);

   // Scan backwards to the line boundary and convert back to a SourceLoc.
   return getSourceLoc(findStartOfLine(BufStart, Pos));
}

/// Returns the location just past the end of the line containing \p Loc
/// (after the line terminator, since the newline is eaten below).
SourceLoc Lexer::getLocForEndOfLine(SourceManager &SM, SourceLoc Loc) {
   // Don't try to do anything with an invalid location.
   if (Loc.isInvalid())
      return Loc;

   // Figure out which buffer contains this location.
   int BufferID = SM.findBufferContainingLoc(Loc);
   if (BufferID < 0)
      return SourceLoc();

   // Use fake language options; language options only affect validity
   // and the exact token produced.
   LangOptions FakeLangOpts;

   // Here we return comments as tokens because either the caller skipped
   // comments and normally we won't be at the beginning of a comment token
   // (making this option irrelevant), or the caller lexed comments and
   // we need to lex just the comment token.
   Lexer L(FakeLangOpts, SM, BufferID, nullptr, LexerMode::Polarphp,
           HashbangMode::Allowed, CommentRetentionMode::ReturnAsTokens);
   // Position the lexer at Loc, then advance past the rest of the line
   // (including the newline); CurPtr then points at the next line's start.
   L.restoreState(State(Loc));
   L.skipToEndOfLine(/*EatNewline=*/true);
   return getSourceLoc(L.CurPtr);
}

/// Returns the leading horizontal whitespace of the line containing \p Loc.
/// If \p ExtraIndentation is non-null, it receives the string for one extra
/// indentation level.
StringRef Lexer::getIndentationForLine(SourceManager &SM, SourceLoc Loc,
                                       StringRef *ExtraIndentation) {
   // FIXME: do something more intelligent here.
   //
   // Four spaces is the typical indentation in Swift code, so for now just use
   // that directly here, but if someone was to do something better, updating
   // here will update everyone.
   if (ExtraIndentation)
      *ExtraIndentation = "    ";

   // Invalid locations yield no indentation.
   if (Loc.isInvalid())
      return "";

   // Locate the buffer that owns this location.
   int BufferID = SM.findBufferContainingLoc(Loc);
   if (BufferID < 0)
      return "";

   StringRef Buffer = SM.extractText(SM.getRangeForBuffer(BufferID));
   const char *BufStart = Buffer.data();
   unsigned Offset = SM.getLocOffsetInBuffer(Loc, BufferID);

   // Walk forward from the line start over horizontal whitespace only.
   // NOTE(review): the *Cursor check relies on the buffer being
   // NUL-terminated — confirm against SourceManager's buffer guarantees.
   const char *LineStart = findStartOfLine(BufStart, BufStart + Offset);
   const char *Cursor = LineStart;
   for (; *Cursor && isHorizontalWhitespace(*Cursor); ++Cursor)
      ;

   return StringRef(LineStart, Cursor - LineStart);
}

/// Returns the sub-range of \p AllTokens whose first token starts at
/// \p StartLoc and whose last token starts at \p EndLoc (both inclusive).
/// Both locations must exactly match token start locations in the array.
ArrayRef<Token> slice_token_array(ArrayRef<Token> AllTokens, SourceLoc StartLoc,
                                  SourceLoc EndLoc) {
   assert(StartLoc.isValid() && EndLoc.isValid());
   auto FirstIt = token_lower_bound(AllTokens, StartLoc);
   auto LastIt = token_lower_bound(AllTokens, EndLoc);
   assert(FirstIt->getLoc() == StartLoc && LastIt->getLoc() == EndLoc);
   // Inclusive range: the token at EndLoc is part of the slice.
   const size_t StartIdx = FirstIt - AllTokens.begin();
   const size_t Count = (LastIt - FirstIt) + 1;
   return AllTokens.slice(StartIdx, Count);
}

} // polar::llparser